# Neutral Sentence Generation (Evaluate)

## Load Style Library

In [1]:
from pathlib import Path
from typing import List
import json

# Build the path from its components so it resolves on POSIX as well as on
# Windows (a backslash-separated literal is a single file name on POSIX).
HARUHI_PATH = Path("..") / "Dataset" / "Haruhi" / "Haruhi_clean.jsonl"

# Check that the dataset exists. A real exception is raised instead of an
# `assert`, because assertions are stripped when Python runs with -O.
if not HARUHI_PATH.is_file():
    raise FileNotFoundError("请确保 Haruhi Dataset 的路径正确！")

# Load the Haruhi dataset (JSONL: one JSON object per line).
# Split on "\n" only: str.splitlines() also breaks on characters such as
# U+2028/U+2029, which may appear unescaped inside JSON strings and would
# cut a record in half.
dataset_lines = HARUHI_PATH.read_text(encoding="utf-8").split("\n")

raidenShogun_responses: List[str] = []

for line in dataset_lines:
    # Skip blank lines (e.g. the empty tail after the final newline).
    if not line.strip():
        continue

    item = json.loads(line)

    # Keep only the responses that belong to a single character.
    if item["agent_role"] == "雷电将军":
        raidenShogun_responses.append(item["agent_response"])

# Report how many responses were collected.

print(f"raidenShogun Responses: {len(raidenShogun_responses)}")

raidenShogun Responses: 558


## Initialize LLM Loader

In [2]:
import os
from dataclasses import dataclass, field
from typing import Optional, Dict, Any, Sequence, Tuple, Iterable

from openai import OpenAI
from openai.types.chat import ChatCompletionMessageParam as ChatMsgParam
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from the .env file

# === Configuration ===
# Every setting is read from an environment variable; the second argument is
# the fallback used when that variable is not set.
DEFAULT_API_KEY = os.environ.get("OPENAI_API_KEY", "sk-PLACEHOLDER")
DEFAULT_BASE_URL = os.environ.get(
    "OPENAI_BASE_URL",
    "https://dashscope.aliyuncs.com/compatible-mode/v1",
)
DEFAULT_MODEL = os.environ.get("NEUTRAL_MODEL", "qwen3-max-2025-09-23")
DEFAULT_SYSTEM_PROMPT = (
    "你是一个语言风格分析专家，你需要根据通过一个带有特定角色风格的句子，改写为语义相同但风格中性的版本。要求不使用任何语气词、感叹词或修辞；不体现说话者的性格、情绪或身份；使用普通的书面语或口语表达"
)

# One conversation turn: (user_message, assistant_reply).
# assistant_reply may be None.
ConversationTurn = Tuple[str, Optional[str]]


def _build_messages(
    prompt: str,
    history: Optional[Sequence[ConversationTurn]] = None,
    system_prompt: Optional[str] = None,
) -> list[ChatMsgParam]:
    messages: list[ChatMsgParam] = []

    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})

    if history:
        for user_msg, assistant_msg in history:
            messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": prompt})
    return messages


@dataclass(slots=True)
class SimpleLLMClient:
    """Minimal LLM wrapper: bound to one model, ``ask()`` returns a string."""

    # Model identifier sent with every chat-completion request.
    model: str = DEFAULT_MODEL
    # API key handed to the OpenAI client.
    api_key: str = DEFAULT_API_KEY
    # Base URL of the OpenAI-compatible endpoint.
    base_url: Optional[str] = DEFAULT_BASE_URL
    # System prompt prepended to every conversation (skipped when falsy).
    system_prompt: Optional[str] = DEFAULT_SYSTEM_PROMPT
    # Extra HTTP headers forwarded as the client's default headers.
    extra_headers: Optional[Dict[str, str]] = None
    # Underlying OpenAI client; created in __post_init__, hidden from repr.
    _client: OpenAI = field(init=False, repr=False)

    def __post_init__(self) -> None:
        """Create the OpenAI client from the configured fields."""
        self._client = OpenAI(
            api_key=self.api_key,
            base_url=self.base_url,
            # An empty header dict is normalised to None.
            default_headers=self.extra_headers if self.extra_headers else None,
        )

    def _consume_stream(self, stream_resp: Iterable[Any]) -> str:
        """Consume a streamed response and join its content deltas."""
        chunks: list[str] = []
        for chunk in stream_resp:
            choices = getattr(chunk, "choices", None)
            if not choices:
                # Chunks without choices carry no text.
                continue
            delta = getattr(choices[0], "delta", None)
            if delta and getattr(delta, "content", None):
                chunks.append(delta.content)
        return "".join(chunks)

    def ask(
        self,
        prompt: str,
        history: Optional[Sequence[ConversationTurn]] = None,
        *,
        temperature: float = 0.7,
        top_p: float = 0.9,
        max_tokens: Optional[int] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a reply for ``prompt``.

        Args:
            prompt: The current user input.
            history: Optional prior turns ``[(user, assistant), ...]``; the
                assistant entry may be ``None``.
            temperature: Sampling temperature passed to the API.
            top_p: Nucleus-sampling value passed to the API.
            max_tokens: Optional completion length limit passed to the API.
            **kwargs: Passed straight through to the chat-completion call.
                ``stream=True`` is supported: the streamed chunks are
                consumed and joined, so the return value is still a string.

        Returns:
            The reply as plain text, or an empty string when the response
            carries no content.
        """
        messages = _build_messages(
            prompt=prompt,
            history=history,
            system_prompt=self.system_prompt,
        )

        # Pull `stream` out of kwargs so it is never passed twice to create();
        # previously ask(..., stream=True) raised TypeError.
        stream = bool(kwargs.pop("stream", False))

        response = self._client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            stream=stream,
            **kwargs,
        )

        if stream:
            return self._consume_stream(response)

        # Guard against a response without choices instead of raising
        # IndexError, as the documented contract is to return "".
        choices = getattr(response, "choices", None)
        if not choices:
            return ""

        choice = choices[0]
        content = getattr(getattr(choice, "message", None), "content", None)
        if content:
            return content
        # Fallback for legacy/unusual payloads that expose `text`; never
        # return None from a function annotated as returning str.
        return getattr(choice, "text", "") or ""


# Create one shared instance that the other notebook cells call directly
llm = SimpleLLMClient()

## Call LLM to Generate Neutral Sentences

In [4]:
import os
from time import sleep
from typing import Dict, Optional, Sequence
from tqdm import tqdm

# Sampling temperature and pause between requests; both can be overridden
# through environment variables.
DEFAULT_TEMPERATURE = float(os.environ.get("NEUTRAL_TEMPERATURE", "0.2"))
DEFAULT_SLEEP_SECONDS = float(os.environ.get("NEUTRAL_SLEEP_SECONDS", "0.3"))

# Example (prompt, neutral answer) pairs in the same format that
# build_neutral_prompt produces.
# NOTE(review): not referenced by the visible code — presumably intended as
# few-shot history; confirm.
SHOTS_EXAMPLE = [
    (
        "风格句: 我今晚吃了楠符电池呢！你要不要也来一块？（递上）\n中性句: ",
        "我今天吃了电池。",
    ),
    (
        "风格句: 我才不是什么地雷女呢，只是长得像而已...\n中性句: ",
        "我不是地雷女。",
    ),
]


def build_neutral_prompt(stylized_text: str) -> str:
    """Build the rewrite prompt for one stylized sentence.

    The stylized sentence is placed after the "风格句: " label, followed on
    the next line by an open "中性句: " label for the model to complete.
    """
    return "风格句: {}\n中性句: ".format(stylized_text)


def generate_neutral_corpus_with_CoT(
    stylized_texts: Sequence[str],
    character: Optional[str] = None,
    *,
    limit: Optional[int] = None,
    temperature: float = DEFAULT_TEMPERATURE,
    sleep_seconds: float = DEFAULT_SLEEP_SECONDS,
    history: Optional[Sequence[ConversationTurn]] = None,
) -> list[Dict[str, str]]:
    """Call ``llm.ask`` for each stylized sentence and collect the rewrites.

    Args:
        stylized_texts: Sentences to rewrite, processed in order.
        character: Optional name, used only in the progress-bar label.
        limit: Process at most this many sentences; ``None`` means all.
            Values below zero are treated as zero.
        temperature: Sampling temperature forwarded to ``llm.ask``.
        sleep_seconds: Pause between consecutive requests.
        history: Optional ``(user, assistant)`` turns forwarded to
            ``llm.ask`` with every request (e.g. few-shot examples).
            Defaults to ``None``, i.e. no history.

    Returns:
        One ``{"original": ..., "neutral": ...}`` dict per processed
        sentence, in input order.
    """
    # Trim the input up front so the progress bar's total is exact and the
    # bar completes, rather than breaking out of the loop part-way.
    if limit is None:
        selected = stylized_texts
    else:
        selected = stylized_texts[: max(limit, 0)]

    # Avoid a literal "None" in the label when no character is given.
    if character is None:
        label = "Generating neutral sentences"
    else:
        label = f"Generating {character} neutral sentences"

    records: list[Dict[str, str]] = []
    last_index = len(selected) - 1

    for idx, text in enumerate(tqdm(selected, desc=label)):
        neutral_sentence = llm.ask(
            prompt=build_neutral_prompt(text),
            history=history,
            temperature=temperature,
        ).strip()
        records.append({"original": text, "neutral": neutral_sentence})

        # Throttle between requests only; no pause is needed after the last.
        if idx < last_index and sleep_seconds > 0:
            sleep(sleep_seconds)

    return records


# Generate a neutral rewrite for every collected response.
raidenShogun_responses_records = generate_neutral_corpus_with_CoT(
    stylized_texts=raidenShogun_responses,
    character="雷电将军",
)

Generating 雷电将军 neutral sentences: 100%|██████████| 558/558 [09:34<00:00,  1.03s/it]


## Data Cleaning (Evaluate)

In [15]:
# Substrings that disqualify a record when found in its neutral rewrite
# (presumably character-identifying terms — confirm).
blacklist = ["雷电将军", "雷电", "将军", "稻妻"]

# Keep only the records whose neutral text contains none of the terms.
valid_records = [
    record
    for record in raidenShogun_responses_records
    if all(term not in record["neutral"] for term in blacklist)
]

## Export Neutral Sentence Results

In [None]:
# Accumulates every row that will be written to the export file.
final_outputs = []


def save_records(character: str, records: list[Dict[str, str]]):
    """Append one export row per record to the module-level ``final_outputs``.

    Each row holds the given ``character`` plus the record's "original" and
    "neutral" values; any other keys in the record are ignored.
    """
    final_outputs.extend(
        {
            "character": character,
            "original": entry["original"],
            "neutral": entry["neutral"],
        }
        for entry in records
    )

save_records("raidenShogun", valid_records)

# Write the collected rows to a JSONL file, one JSON object per line.
output_path = Path("./data/neutral_sentences_eval.jsonl")
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Chinese text readable in the file.
    f.writelines(
        json.dumps(item, ensure_ascii=False) + "\n" for item in final_outputs
    )

: 