In [1]:
import torch
import torch.nn as nn
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from peft import PeftModel, PeftConfig
import torch.cuda.amp as amp  # 添加这行

In [2]:
# Map-style dataset that tokenizes AG News rows on access.
class AgnewsDataset(Dataset):
    """Wrap an indexable collection of AG News rows and tokenize them lazily.

    Every row must be a mapping exposing a 'text' and a 'label' entry; those
    are the only two keys read by ``__getitem__``.

    Args:
        dataset: Sized, integer-indexable collection of rows.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``.
            Its result must provide 'input_ids' and 'attention_mask' tensors.
        max_length: Length each sample is truncated/padded to. Defaults to
            512, the value that used to be hard-coded, so existing two-argument
            callers behave exactly as before.
    """

    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Tokenize row ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (tokenizer tensors with
            a size-1 leading dimension removed by ``squeeze(0)``) and 'label'
            (passed through unchanged from the row).
        """
        item = self.dataset[idx]
        text = item['text']
        label = item['label']

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # return_tensors='pt' yields batched tensors; squeeze(0) drops the
        # leading batch dimension so the DataLoader can re-batch the samples.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }

In [3]:
# Load the test split of the AG News dataset.
testset = load_dataset(path="ag_news", split="test")

In [4]:
# Paths / tokenizer setup.
# `model_path` points at the fine-tuned adapter checkpoint; replace it with
# the location of your own model.
model_path = "trained_model"

# The tokenizer is loaded from the 'llama3' identifier.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='llama3')
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Load the model: read the adapter config, instantiate the base model it
# names, then attach the PEFT adapter stored under `model_path`.
peft_config = PeftConfig.from_pretrained(model_path)
base_model_id = peft_config.base_model_name_or_path
model = PeftModel.from_pretrained(
    # device_map="auto" is intentionally left disabled here.
    AutoModelForCausalLM.from_pretrained(base_model_id),
    model_path,
)
# Switch to inference mode.
model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
           

In [6]:

# Prepare the test data.
# `testset` is rebound from the raw dataset to its AgnewsDataset wrapper.
# Guard the wrapping so that re-running this cell does not wrap an existing
# AgnewsDataset a second time (its items carry no 'text' key, so indexing the
# double-wrapped dataset would fail).
if not isinstance(testset, AgnewsDataset):
    testset = AgnewsDataset(testset, tokenizer)
test_dataloader = DataLoader(testset, batch_size=16)

In [7]:
# Move the model to the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
           

In [8]:
# Accumulators for the test-set run: model predictions and reference labels.
predictions, true_labels = [], []

In [9]:

# Create a GradScaler for mixed precision.
# NOTE(review): no visible cell of this notebook references `scaler`, and the
# evaluation loop runs under torch.no_grad() — confirm whether this cell is
# still needed.
scaler = torch.cuda.amp.GradScaler()

In [11]:
# Run the model over the whole test set and collect predictions.

# Start from empty result lists so that re-running this cell does not append a
# second copy of the results to those gathered by an earlier run.
predictions = []
true_labels = []

# Mixed precision is only enabled on CUDA; on CPU the forward pass runs
# without autocast.
use_autocast = device.type == 'cuda'

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with torch.autocast(device_type=device.type, enabled=use_autocast):
            outputs = model(input_ids, attention_mask=attention_mask)
            # Use the logits at the last sequence position.
            # NOTE(review): AgnewsDataset pads every sample to a fixed length;
            # if the tokenizer pads on the right, position -1 is a padding
            # position — confirm this matches how the model was trained.
            logits = outputs.logits[:, -1, :]
            # NOTE(review): the argmax runs over the full logit dimension and
            # the resulting index is compared directly with the dataset label.
            preds = logits.argmax(dim=-1)

        predictions.extend(preds.cpu().tolist())
        # Labels are only needed on the host, so they are not moved to `device`.
        true_labels.extend(batch['label'].tolist())

# Compute the evaluation metrics: accuracy plus weighted-average
# precision / recall / F1 over the collected predictions.
accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=true_labels,
    y_pred=predictions,
    average='weighted',
)

report_lines = [
    f"\nTest Accuracy: {accuracy:.4f}",
    f"Test Precision: {precision:.4f}",
    f"Test Recall: {recall:.4f}",
    f"Test F1 Score: {f1:.4f}",
]
for report_line in report_lines:
    print(report_line)

Testing: 100%|██████████| 475/475 [06:22<00:00,  1.24it/s]


Test Accuracy: 0.9039
Test Precision: 0.9055
Test Recall: 0.9039
Test F1 Score: 0.9042



