In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
train_df=pd.read_csv('data/train.csv')
test_df=pd.read_csv('data/test.csv')
print(train_df.head())
train_df['text']=train_df['text'].str.lower()
train_df['text']=train_df['text'].str.replace(r'[^\w\s]','')
train_texts,val_texts,train_labels,val_labels=train_test_split(train_df['text'],train_df['target'],test_size=0.2,random_state=42)

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [9]:
from transformers import RobertaTokenizer

# 加载 RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 对文本进行 tokenization 和 padding
def tokenize_function(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# 转换为 torch tensors
import torch
class DisasterTweetsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 创建训练和验证数据集
train_dataset = DisasterTweetsDataset(train_encodings, train_labels)
val_dataset = DisasterTweetsDataset(val_encodings, val_labels)


In [18]:
import torch
import torch.nn as nn
from transformers import RobertaModel

class RoBERTaLSTMModel(nn.Module):
    def __init__(self, roberta_model_name='roberta-base', hidden_dim=128, lstm_layers=2, num_labels=2):
        super(RoBERTaLSTMModel, self).__init__()

        # 加载预训练的 RoBERTa 模型
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)

        # LSTM 层
        self.lstm = nn.LSTM(input_size=self.roberta.config.hidden_size,  # RoBERTa 输出的隐藏层维度
                            hidden_size=hidden_dim,  # LSTM 的隐藏层维度
                            num_layers=lstm_layers,  # LSTM 层数
                            batch_first=True,  # batch size 在第一个维度
                            bidirectional=True)  # 双向 LSTM

        # 全连接层用于最终分类
        self.fc = nn.Linear(hidden_dim * 2, num_labels)  # 双向 LSTM 输出维度是 hidden_dim * 2

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        # 通过 RoBERTa 获得序列的表示
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids)

        # 获取 RoBERTa 的隐藏层输出 (last_hidden_state)
        hidden_states = roberta_output.last_hidden_state

        # 通过 LSTM 层进行处理
        lstm_output, (hn, cn) = self.lstm(hidden_states)  # LSTM 输出，hn 是最后一层的隐藏状态

        # 使用 LSTM 最后的隐藏状态作为特征
        # 双向LSTM，合并两端的隐藏状态
        lstm_output = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=-1)

        # 通过全连接层进行分类
        logits = self.fc(lstm_output)

        # 确保 logits 是二维张量 (batch_size, num_labels)
        logits = logits.view(-1, self.fc.out_features)

        return logits


In [19]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score
import numpy as np
training_args = TrainingArguments(
    load_best_model_at_end=True,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    output_dir='./result',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy='epoch',
    save_strategy='epoch',
)
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)  # 获取预测标签
    f1 = f1_score(labels, preds)
    return {"f1": f1}
trainer = Trainer(
    model=RoBERTaLSTMModel(),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print(training_args.device)  # 输出: cuda 代表使用的是 GPU

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda:0


In [20]:
trainer.train()
# 输出评估结果



RuntimeError: grad can be implicitly created only for scalar outputs

In [6]:
# 在验证集上评估模型
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")


{'eval_loss': 0.3887951374053955, 'eval_f1': 0.8023648648648649, 'eval_runtime': 5.2753, 'eval_samples_per_second': 288.703, 'eval_steps_per_second': 36.206, 'epoch': 5.0}


In [7]:
# 对测试集进行预测
test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=128)
test_dataset = DisasterTweetsDataset(test_encodings, [0] * len(test_df))  # 这里标签设为0，实际会被预测

# 使用模型生成预测
predictions = trainer.predict(test_dataset)

# 获取预测结果
predicted_labels = predictions.predictions.argmax(axis=-1)

# 创建提交文件
submission = pd.DataFrame({'id': test_df['id'], 'target': predicted_labels})
submission.to_csv('submission.csv', index=False)
