In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from tqdm import tqdm

In [2]:
# 加载模型和tokenizer
def load_model(base_model_name, peft_model_path):
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name, device_map="auto", load_in_8bit=False)
    peft_config = PeftConfig.from_pretrained(peft_model_path)
    model = PeftModel.from_pretrained(base_model, peft_model_path)
    tokenizer = AutoTokenizer.from_pretrained(peft_model_path)
    return model, tokenizer

In [3]:
# 生成回复
def generate_response(model, tokenizer, prompt, max_length=512):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [4]:
# 准备数据
def prepare_data(example):
    chosen_parts = example['chosen'].split('\n\nHuman: ')
    if len(chosen_parts) > 1:
        human_input = chosen_parts[1].split('\n\nAssistant: ')[0]
        assistant_output = chosen_parts[1].split('\n\nAssistant: ')[1].split('\n\nHuman: ')[0]
        return {
            "input": human_input.strip(),
            "reference": assistant_output.strip()
        }
    else:
        return {"input": "", "reference": ""}

In [5]:
# 主评估函数
from tqdm import tqdm

# 在主评估函数中
def evaluate(model, tokenizer, dataset):
    bleu_scores = []
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    # 使用tqdm创建一个单一的进度条
    progress_bar = tqdm(dataset, total=len(dataset), desc="Evaluating")

    for example in progress_bar:
        if example['input']:  # 确保输入不为空
            generated = generate_response(model, tokenizer, example['input'])
            reference = example['reference']

            # 计算BLEU分数
            bleu = sentence_bleu([reference.split()], generated.split())
            bleu_scores.append(bleu)

            # 计算ROUGE分数
            rouge_score = rouge_scorer_instance.score(reference, generated)
            for key in rouge_scores:
                rouge_scores[key].append(rouge_score[key].fmeasure)

        # 更新进度条
        progress_bar.set_postfix({'BLEU': sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0})

    # 计算平均分数
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge = {key: sum(scores) / len(scores) for key, scores in rouge_scores.items()}

    return avg_bleu, avg_rouge

In [6]:
base_model_name = "llama3"  # 替换为您的基础模型名称
peft_model_path = "multitask_model1"  # 替换为您保存的模型路径

In [7]:
model, tokenizer = load_model(base_model_name, peft_model_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# 加载hh-rlhf测试集
dataset = load_dataset("hh-rlhf", split="test").select(range(100))
processed_dataset = dataset.map(prepare_data, remove_columns=dataset.column_names)

In [9]:
import warnings
import logging

warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

In [10]:
avg_bleu, avg_rouge = evaluate(model, tokenizer, processed_dataset)
    
print(f"Average BLEU score: {avg_bleu}")
print(f"Average ROUGE scores:")
for key, value in avg_rouge.items():
    print(f"  {key}: {value}")

Evaluating: 100%|██████████| 100/100 [13:36<00:00,  8.16s/it, BLEU=0.002] 

Average BLEU score: 0.0020004898170982287
Average ROUGE scores:
  rouge1: 0.07118415517590838
  rouge2: 0.01505648134563995
  rougeL: 0.05227885903169582



