# Neutral Sentence Generation

## Load Style Library

In [9]:
from pathlib import Path
from typing import List
import json

MUICE_PATH = Path("../Dataset/Muice/train.jsonl")
# Use forward slashes: pathlib accepts them on every platform, whereas a raw
# Windows-style string is treated as a single file name on POSIX systems.
HARUHI_PATH = Path("../Dataset/Haruhi/Haruhi_clean.jsonl")

# Fail fast when either dataset file is missing.
assert MUICE_PATH.is_file(), "请确保 Muice Dataset 的路径正确！"
assert HARUHI_PATH.is_file(), "请确保 Haruhi Dataset 的路径正确！"

# Load the Muice dataset: one JSON object per line, keep its "Response" field.
dataset_lines = MUICE_PATH.read_text(encoding="utf-8").splitlines()

muice_responses: List[str] = []

for line in dataset_lines:
    if not line.strip():
        # Tolerate blank lines instead of crashing inside json.loads.
        continue
    item = json.loads(line)
    muice_responses.append(item["Response"])

# Load the Haruhi dataset and split its responses by character.
dataset_lines = HARUHI_PATH.read_text(encoding="utf-8").splitlines()

ayaka_responses: List[str] = []
zhongli_responses: List[str] = []
hutao_responses: List[str] = []
haruhi_responses: List[str] = []

for line in dataset_lines:
    if not line.strip():
        continue
    item = json.loads(line)

    # Keep only the rows that belong to one of the selected characters.
    # The first three are matched on "agent_role"; Haruhi is matched on
    # "agent_role_name_en", exactly as the data is keyed.
    if item["agent_role"] == "神里绫华":
        ayaka_responses.append(item["agent_response"])
    elif item["agent_role"] == "钟离":
        zhongli_responses.append(item["agent_response"])
    elif item["agent_role"] == "胡桃":
        hutao_responses.append(item["agent_response"])
    elif item["agent_role_name_en"] == "haruhi":
        haruhi_responses.append(item["agent_response"])

# Report how many responses were collected for each character.

print(f"Muice Responses: {len(muice_responses)}")
print(f"Ayaka Responses: {len(ayaka_responses)}")
print(f"Zhongli Responses: {len(zhongli_responses)}")
print(f"Hutao Responses: {len(hutao_responses)}")
print(f"Haruhi Responses: {len(haruhi_responses)}")

Muice Responses: 3402
Ayaka Responses: 1416
Zhongli Responses: 514
Hutao Responses: 845
Haruhi Responses: 1100


## Initialize LLM Loader

In [1]:
import os
from dataclasses import dataclass, field
from typing import Optional, Dict, Any, Sequence, Tuple, Iterable

from openai import OpenAI
from openai.types.chat import ChatCompletionMessageParam as ChatMsgParam
from dotenv import load_dotenv

load_dotenv()  # 从 .env 文件加载环境变量

# === Configuration ===
# Every default below is read from an environment variable and falls back to
# the literal given as the second argument when the variable is unset.
DEFAULT_API_KEY = os.getenv("OPENAI_API_KEY", "sk-PLACEHOLDER")
DEFAULT_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1")
DEFAULT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "qwen3-max-2025-09-23")
# System prompt (in Chinese) asking the model to rewrite a character-styled
# sentence into a semantically equivalent, stylistically neutral one.
DEFAULT_SYSTEM_PROMPT = "你是一个语言风格分析专家，你需要根据通过一个带有特定角色风格的句子，改写为语义相同但风格中性的版本。要求不使用任何语气词、感叹词或修辞；不体现说话者的性格、情绪或身份；使用普通的书面语或口语表达"

# One conversation turn: (user_message, assistant_reply); the reply may be None.
ConversationTurn = Tuple[str, Optional[str]]
"""表示一次对话轮次：(user_message, assistant_reply)。assistant_reply 可为 None。"""


def _build_messages(
    prompt: str,
    history: Optional[Sequence[ConversationTurn]] = None,
    system_prompt: Optional[str] = None,
) -> list[ChatMsgParam]:
    """Assemble the chat message list sent to the completion endpoint.

    Order: optional system message, then each history turn (the user message
    followed by the assistant reply when it is non-empty), then the current
    prompt as the final user message.
    """
    conversation: list[ChatMsgParam] = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )

    for question, answer in history or ():
        conversation.append({"role": "user", "content": question})
        if answer:
            conversation.append({"role": "assistant", "content": answer})

    conversation.append({"role": "user", "content": prompt})
    return conversation


@dataclass(slots=True)
class SimpleLLMClient:
    """Minimal wrapper around an OpenAI-compatible chat completion endpoint.

    The model, credentials and system prompt are fixed when the instance is
    created; ``ask()`` sends one prompt (plus optional history) and returns
    the reply as a plain string.
    """

    model: str = DEFAULT_MODEL
    api_key: str = DEFAULT_API_KEY
    base_url: Optional[str] = DEFAULT_BASE_URL
    system_prompt: Optional[str] = DEFAULT_SYSTEM_PROMPT
    extra_headers: Optional[Dict[str, str]] = None
    # SDK client built in __post_init__; excluded from __init__ and repr.
    _client: OpenAI = field(init=False, repr=False)

    def __post_init__(self) -> None:
        """Create the underlying SDK client from the configured fields."""
        self._client = OpenAI(
            api_key=self.api_key,
            base_url=self.base_url,
            default_headers=self.extra_headers or None,
        )

    def _consume_stream(self, stream_resp: Iterable[Any]) -> str:
        """Concatenate the ``delta.content`` pieces of a streamed response.

        Chunks without choices, without a delta, or with empty content are
        skipped.
        """
        chunks: list[str] = []
        for chunk in stream_resp:
            choices = getattr(chunk, "choices", None)
            if not choices:
                continue
            delta = getattr(choices[0], "delta", None)
            if delta and getattr(delta, "content", None):
                chunks.append(delta.content)
        return "".join(chunks)

    def ask(
        self,
        prompt: str,
        history: Optional[Sequence[ConversationTurn]] = None,
        *,
        temperature: float = 0.7,
        top_p: float = 0.9,
        max_tokens: Optional[int] = None,
        **kwargs: Any,
    ) -> str:
        """Send ``prompt`` to the model and return its reply as text.

        Args:
            prompt: Current user input.
            history: Optional earlier turns ``[(user, assistant), ...]``;
                the assistant part of a turn may be ``None``.
            temperature: Sampling temperature forwarded to the API.
            top_p: Nucleus-sampling value forwarded to the API.
            max_tokens: Completion token limit forwarded to the API.
            **kwargs: Extra keyword arguments forwarded unchanged to
                ``chat.completions.create``. The request is always made
                with ``stream=False``.

        Returns:
            The reply text, or an empty string when the response carries no
            choices or no textual content.
        """
        messages = _build_messages(
            prompt=prompt,
            history=history,
            system_prompt=self.system_prompt,
        )

        response = self._client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            stream=False,
            **kwargs,
        )

        # An empty ``choices`` list would otherwise raise IndexError, which
        # contradicts the documented "empty string on empty response".
        choices = getattr(response, "choices", None)
        if not choices:
            return ""

        choice = choices[0]
        content = getattr(getattr(choice, "message", None), "content", None)
        if content:
            return content  # type: ignore[return-value]
        # Legacy/abnormal payloads: fall back to ``text`` and never return
        # None, because callers invoke ``.strip()`` on the result.
        return getattr(choice, "text", None) or ""


# Create one shared client with the default settings so that other notebook
# cells can call it directly.
llm = SimpleLLMClient()

## Call LLM to Generate Neutral Sentences

In [11]:
import os
from time import sleep
from typing import Dict, Optional, Sequence
from tqdm import tqdm

# Sampling temperature and per-request pause, overridable via environment
# variables; the string defaults are parsed to float.
DEFAULT_TEMPERATURE = float(os.getenv("NEUTRAL_TEMPERATURE", "0.2"))
DEFAULT_SLEEP_SECONDS = float(os.getenv("NEUTRAL_SLEEP_SECONDS", "0.3"))

# Two (prompt, reply) example pairs in the "styled sentence -> neutral sentence" format.
# NOTE(review): the generation function in this cell calls llm.ask with
# history=None, so these examples look unused here — confirm whether they were
# meant to be passed as history.
SHOTS_EXAMPLE = [("风格句: 我今晚吃了楠符电池呢！你要不要也来一块？（递上）\n中性句: ", "我今天吃了电池。"), ("风格句: 我才不是什么地雷女呢，只是长得像而已...\n中性句: ", "我不是地雷女。")]


def build_neutral_prompt(stylized_text: str) -> str:
    """Wrap a stylized sentence in the "styled sentence / neutral sentence" template.

    The returned prompt ends with the open "neutral sentence" label so the
    model completes it.
    """
    template = "风格句: {}\n中性句: "
    return template.format(stylized_text)


def generate_neutral_corpus_with_CoT(
    stylized_texts: Sequence[str],
    character: Optional[str] = None,
    *,
    limit: Optional[int] = None,
    temperature: float = DEFAULT_TEMPERATURE,
    sleep_seconds: float = DEFAULT_SLEEP_SECONDS,
) -> list[Dict[str, str]]:
    """Rewrite each stylized sentence into a neutral one via ``llm.ask``.

    Args:
        stylized_texts: Sentences written in a character's style.
        character: Optional name shown in the progress-bar description.
        limit: Process at most this many sentences; ``None`` means all.
            Zero or negative values process nothing.
        temperature: Sampling temperature passed to ``llm.ask``.
        sleep_seconds: Pause after every request.

    Returns:
        One ``{"original": ..., "neutral": ...}`` dict per processed sentence,
        in input order.
    """
    # Slice up front so the progress bar total matches the work really done
    # (the old ``limit or len(...)`` was wrong for limit=0 and for limits
    # larger than the input).
    selected = stylized_texts if limit is None else stylized_texts[: max(limit, 0)]

    # Avoid the literal "None" in the label when no character name is given.
    label = (
        f"Generating {character} neutral sentences"
        if character
        else "Generating neutral sentences"
    )

    records: list[Dict[str, str]] = []

    for text in tqdm(selected, desc=label, total=len(selected)):
        neutral_sentence = llm.ask(
            prompt=build_neutral_prompt(text),
            history=None,
            temperature=temperature,
        ).strip()
        records.append({"original": text, "neutral": neutral_sentence})

        sleep(sleep_seconds)

    return records




In [12]:
# Generate neutral sentences per character. The character name is passed so
# each progress bar is labelled with it instead of "None".
muice_neutral_records = generate_neutral_corpus_with_CoT(muice_responses, "Muice")
ayaka_neutral_records = generate_neutral_corpus_with_CoT(ayaka_responses, "Ayaka")
zhongli_neutral_records = generate_neutral_corpus_with_CoT(zhongli_responses, "Zhongli")
hutao_neutral_records = generate_neutral_corpus_with_CoT(hutao_responses, "Hutao")
haruhi_neutral_records = generate_neutral_corpus_with_CoT(haruhi_responses, "Haruhi")

Generating None neutral sentences:   0%|          | 0/3402 [00:00<?, ?it/s]


KeyboardInterrupt: 

## Export Neutral Sentence Results

In [None]:
final_outputs = []


def save_records(character: str, records: list[Dict[str, str]]):
    """Append ``records`` to ``final_outputs``, tagging each with ``character``.

    Only the "original" and "neutral" fields of each record are copied.
    """
    final_outputs.extend(
        {
            "character": character,
            "original": entry["original"],
            "neutral": entry["neutral"],
        }
        for entry in records
    )

# Collect every character's records into final_outputs, in this order.
save_records("Muice", muice_neutral_records)
save_records("Ayaka", ayaka_neutral_records)
save_records("Zhongli", zhongli_neutral_records)
save_records("Haruhi", haruhi_neutral_records)
save_records("Hutao", hutao_neutral_records)

# Write the collected records to a JSONL file, one JSON object per line.
# NOTE(review): the file is opened in append mode, so running this cell again
# adds the same records a second time — confirm this is intended.
output_path = Path("./data/neutral_sentences.jsonl")
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("a", encoding="utf-8") as f:
    for item in final_outputs:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

# CoT Generation

## Load Neutral Sentences

In [3]:
from pathlib import Path
from typing_extensions import TypedDict
import json

class NeutralSentenceItem(TypedDict):
    """One record of the neutral-sentence JSONL file."""

    character: str  # character label attached when the record was exported
    original: str  # the stylized source sentence
    neutral: str  # the neutral rewrite of ``original``


neutral_path = Path("./data/neutral_sentences.jsonl")
neutral_items: list[NeutralSentenceItem] = []

# Iterate the file handle directly; every line holds one JSON-encoded record.
with neutral_path.open("r", encoding="utf-8") as f:
    for line in f:
        neutral_items.append(json.loads(line))

print(f"Loaded {len(neutral_items)} neutral sentence items.")

Loaded 6596 neutral sentence items.


## Load Style Vectors (Lexical, Syntactic, Pragmatic)

In [6]:
from typing import Optional

# PMI
def get_lexical_keywords_from_file(file: Path, top_n: int = 25) -> list[str]:
    """Return the first ``top_n`` keys of the JSON object stored in ``file``.

    Keys are returned in the order they appear in the file; no sorting is
    done here.
    """
    with open(file, "r", encoding="utf-8") as handle:
        scores: dict[str, float] = json.load(handle)
    keywords = list(scores.keys())
    return keywords[:top_n]

# Per-character keyword lists, each taken from that character's filtered PMI
# JSON file (default top_n of 25 keys).
Lexical_Muice = get_lexical_keywords_from_file(Path("./outputs/pmi/muice_pmi_filtered.json"))
Lexical_Ayaka = get_lexical_keywords_from_file(Path("./outputs/pmi/ayaka_pmi_filtered.json"))
Lexical_Zhongli = get_lexical_keywords_from_file(Path("./outputs/pmi/zhongli_pmi_filtered.json"))
Lexical_Hutao = get_lexical_keywords_from_file(Path("./outputs/pmi/hutao_pmi_filtered.json"))
Lexical_Haruhi = get_lexical_keywords_from_file(Path("./outputs/pmi/haruhi_pmi_filtered.json"))

# Lookup table keyed by the English character name.
Lexical_All = {"Muice": Lexical_Muice, "Ayaka": Lexical_Ayaka, "Zhongli": Lexical_Zhongli, "Hutao": Lexical_Hutao, "Haruhi": Lexical_Haruhi}

# PCFG
# Hard-coded per-character syntactic feature tables: feature name -> float.
# NOTE(review): presumably relative weights produced by an earlier PCFG
# analysis step — confirm the source and normalization.
Syntactic_Muice = {'declarativity': 0.1103257643217572, 'parallelism': 0.02918150786583556, 'ellipsis_or_fragmentation': 0.08529979222321163, 'subordination': 0.19688705847432472, 'interjectionality': 0.008644998515880083, 'clausal_embedding': 0.034784060552092606, 'referentiality': 0.11624369249035323, 'syntactic_compression': 0.18596022558622738, 'nominal_complexity': 0.03342980112793114, 'coordination_density': 0.002615761353517364, 'quantificationality': 0.038197536360937964, 'modifier_density': 0.1393217571979816, 'prepositional_density': 0.01910804392994954}
Syntactic_Ayaka = {'declarativity': 0.09320164543629895, 'parallelism': 0.029192583613203236, 'ellipsis_or_fragmentation': 0.061485264601259915, 'subordination': 0.18419745235587529, 'clausal_embedding': 0.046163629498618866, 'interjectionality': 0.002046859164166054, 'syntactic_compression': 0.1999165358399078, 'nominal_complexity': 0.05623894596689255, 'referentiality': 0.10019673694878878, 'coordination_density': 0.02416486158860118, 'quantificationality': 0.042964170028417556, 'modifier_density': 0.13753701238051708, 'prepositional_density': 0.022694302577452752}
Syntactic_Zhongli = {'declarativity': 0.09879656160458453, 'parallelism': 0.029398280802292263, 'ellipsis_or_fragmentation': 0.06412607449856733, 'subordination': 0.1839541547277937, 'clausal_embedding': 0.037478510028653295, 'interjectionality': 0.003151862464183381, 'syntactic_compression': 0.21077363896848136, 'quantificationality': 0.04022922636103152, 'referentiality': 0.08240687679083095, 'nominal_complexity': 0.06372492836676218, 'coordination_density': 0.02332378223495702, 'modifier_density': 0.14068767908309457, 'prepositional_density': 0.02194842406876791}
Syntactic_Hutao = {'parallelism': 0.03191357258164659, 'declarativity': 0.10471252949211474, 'ellipsis_or_fragmentation': 0.07646218800447038, 'clausal_embedding': 0.042685955544517575, 'subordination': 0.18254066807400968, 'interjectionality': 0.01809884515087545, 'syntactic_compression': 0.1987458090152738, 'referentiality': 0.09049422575437725, 'quantificationality': 0.05016763938904756, 'nominal_complexity': 0.034459207748665094, 'coordination_density': 0.013845771762076246, 'modifier_density': 0.1413448404321371, 'prepositional_density': 0.014528747050788526}
Syntactic_Haruhi = {'declarativity': 0.0939982347749338, 'parallelism': 0.032288908502500734, 'subordination': 0.19174757281553398, 'ellipsis_or_fragmentation': 0.07542659605766402, 'clausal_embedding': 0.042144748455428066, 'interjectionality': 0.008017063842306561, 'syntactic_compression': 0.18409826419535158, 'quantificationality': 0.040453074433656956, 'referentiality': 0.12099146807884673, 'nominal_complexity': 0.043211238599588114, 'coordination_density': 0.01195204471903501, 'prepositional_density': 0.011253309796999117, 'modifier_density': 0.14441747572815533}

# Lookup table keyed by the English character name.
Syntactic_All = {"Muice": Syntactic_Muice, "Ayaka": Syntactic_Ayaka, "Zhongli": Syntactic_Zhongli, "Hutao": Syntactic_Hutao, "Haruhi": Syntactic_Haruhi}

# Pragmatic style: paths of the per-character pragmatic-style JSONL files.
Pragmatic_Muice = "./outputs/pragmatic/muice.jsonl"
Pragmatic_ayaka = "./outputs/pragmatic/ayaka.jsonl"
Pragmatic_zhongli = "./outputs/pragmatic/zhongli.jsonl"
Pragmatic_hutao = "./outputs/pragmatic/hutao.jsonl"
Pragmatic_haruhi = "./outputs/pragmatic/haruhi.jsonl"

class RawPCFGItem(TypedDict):
    """A record as stored in a pragmatic-style JSONL file."""

    prompt: str
    response: str
    pragmatic_styles: list[dict[str, float]]


class PCFGItem(TypedDict):
    """A response paired with the style labels that passed the threshold."""

    response: str
    pragmatic_styles: list[str]


def read_pragmatic_jsonl_file(jsonl_file: Path, threshold: Optional[float] = None) -> list[PCFGItem]:
    """Read a pragmatic-style JSONL file and keep labels scoring above ``threshold``.

    Each record's list of ``{label: score}`` dicts is merged into one dict
    (later entries overwrite earlier ones for the same label). Labels whose
    score is strictly greater than the threshold are kept, in merge order.
    A missing or falsy threshold is treated as 0. Blank lines are ignored.
    """
    cutoff = threshold or 0

    with open(jsonl_file, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    # Parse every non-blank line before doing any filtering.
    parsed: list[RawPCFGItem] = [
        json.loads(stripped) for raw in raw_lines if (stripped := raw.strip())
    ]

    results: list[PCFGItem] = []
    for entry in parsed:
        # list[dict[str, float]] -> dict[str, float] -> list[str]
        merged: dict[str, float] = {}
        for partial in entry["pragmatic_styles"]:
            merged.update(partial)

        kept = [label for label, score in merged.items() if score > cutoff]
        results.append(PCFGItem(response=entry["response"], pragmatic_styles=kept))

    return results

# Load each character's pragmatic items, keeping labels scoring above 0.4.
muice_pragmatic_items = read_pragmatic_jsonl_file(Path(Pragmatic_Muice), 0.4)
ayaka_pragmatic_items = read_pragmatic_jsonl_file(Path(Pragmatic_ayaka), 0.4)
zhongli_pragmatic_items = read_pragmatic_jsonl_file(Path(Pragmatic_zhongli), 0.4)
hutao_pragmatic_items = read_pragmatic_jsonl_file(Path(Pragmatic_hutao), 0.4)
haruhi_pragmatic_items = read_pragmatic_jsonl_file(Path(Pragmatic_haruhi), 0.4)

# Single flat list across all characters, in the order concatenated here.
pcfg_all_items = muice_pragmatic_items + ayaka_pragmatic_items + zhongli_pragmatic_items + hutao_pragmatic_items + haruhi_pragmatic_items

## Call LLM to Generate CoT Process

In [7]:
import os
from time import sleep
from typing import Dict
from tqdm import tqdm
import re

# Sampling and throttling settings for the CoT stage, overridable through
# environment variables; numeric values are parsed from their string form.
DEFAULT_TEMPERATURE = float(os.getenv("NEUTRAL_TEMPERATURE", "0.4"))
DEFAULT_TOP_P = float(os.getenv("NEUTRAL_TOP_P", "0.9"))
DEFAULT_SLEEP_SECONDS = float(os.getenv("NEUTRAL_SLEEP_SECONDS", "0.3"))
# Model name used by the client created for this stage.
MODEL = os.getenv("NEUTRAL_MODEL", "qwen3-30b-a3b-instruct-2507")

# System prompt (in Chinese): asks the model to explain, inside a
# <think>...</think> block of at most 100 characters, how the character's
# style turns the neutral sentence into the styled one.
SYSTEM_PROMPT = """你是一个角色风格分析器。 
请你根据以下中性句和风格句，生成角色的思维链（CoT），解释角色如何根据对话风格逐步改变中性句的表达方式。 
输出格式: `<think>...</think>`; 要求: 最多100字。清楚解释角色的情绪、动机，以及如何影响句式、语气和词汇使用。
"""

# Few-shot (user prompt, assistant reply) pairs; the CoT generation function
# passes them to llm.ask as history.
SHOTS_EXAMPLE = [
    ("角色: Katsumi\n中性句: 谢谢你帮我。\n风格句: 哼！才、才没想帮你呢，笨蛋！\n人物风格: tsundere, defensive\n关键字: 哼, 才没有, 笨蛋, 不需要, 自己\n", "<think>她讨厌显得脆弱或欠别人情。当被感谢时，她会本能地否认帮助，以维护强硬和独立的形象。她使用“哼”“才没有”来掩饰感激，语气短促、带防御性。</think>"),
    ("角色: Lady Elara\n中性句: 我不知道这是否是个好主意。\n风格句: 或许……我们还需要再仔细考虑，这是否真是最稳妥的选择。\n人物风格: serious, rational, elegant\n关键字: 考虑, 职责, 可能, 荣幸\n", "<think>她理性冷静，不直接否定，而用审慎的表达来保持礼貌。她避免口语化词汇，使用书面词汇“或许”“值得考虑”，并保持句式平衡。</think>")
    ]

# English character label -> Chinese display name used in the prompt.
EN_NAME_TO_ZH = {"Muice": "沐雪", "Ayaka": "神里绫华", "Zhongli": "钟离", "Hutao": "胡桃", "Haruhi": "凉宫春日"}

# Rebind the shared client to the CoT model and system prompt
# (positional arguments are model, then api_key).
llm = SimpleLLMClient(MODEL, DEFAULT_API_KEY, system_prompt=SYSTEM_PROMPT)

def build_neutral_prompt(character: str, neutral_sentence: str, stylized_text: str, keywords:list[str], pcfg_items:dict[str, float], sentence_pragmatic_items:list[str]) -> str:
    """Build the user prompt for chain-of-thought generation.

    Args:
        character: Character name shown to the model.
        neutral_sentence: Neutral version of the sentence.
        stylized_text: Original sentence in the character's style.
        keywords: Lexical keywords for the character, joined with ", ".
        pcfg_items: Syntactic feature table; accepted for interface
            compatibility but not included in the prompt.
        sentence_pragmatic_items: Style labels for this sentence, joined
            with ", ".

    Returns:
        A five-line prompt, one "label: value" pair per line, each line
        terminated by a newline.
    """
    # Join outside the f-string: reusing the same quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    keyword_text = ", ".join(keywords)
    style_text = ", ".join(sentence_pragmatic_items)
    return (
        f"角色: {character}\n"
        f"中性句: {neutral_sentence}\n"
        f"风格句: {stylized_text}\n"
        f"关键字: {keyword_text}\n"
        f"人物风格: {style_text}\n"
    )

def find_sentence_pragmatic_items(stylized_text: str, pcfg_all_items: list[PCFGItem]) -> list[str]:
    """Return the style labels of the first item whose response equals ``stylized_text``.

    An empty list is returned when no item matches.
    """
    matching_styles = (
        entry["pragmatic_styles"]
        for entry in pcfg_all_items
        if entry["response"] == stylized_text
    )
    return next(matching_styles, [])


_THINK_RE = re.compile(r"<\s*think\s*>(.*?)<\s*/\s*think\s*>", re.IGNORECASE | re.DOTALL)

def _extract_think_block(text: str) -> str | None:
    if not text:
        return None
    text = text.strip()

    if text.startswith("<think>") and text.endswith("</think>"):
        return text
    
    # 1) 直接提取
    m = _THINK_RE.search(text)
    if m:
        return f"<think>{m.group(1).strip()}</think>"
    
    # 2) 去掉代码块围栏再尝试
    s2 = re.sub(r"^```(?:[a-zA-Z0-9_-]+)?\s*|\s*```$", "", text, flags=re.DOTALL).strip()
    m = _THINK_RE.search(s2)
    if m:
        return f"<think>{m.group(1).strip()}</think>"
    return None

def generate_neutral_corpus_with_CoT(
    neutral_sentences: list[NeutralSentenceItem],
    *,
    temperature: float = DEFAULT_TEMPERATURE,
    top_p: float = DEFAULT_TOP_P,
    sleep_seconds: float = DEFAULT_SLEEP_SECONDS,
) -> list[Dict[str, str]]:
    """Generate a ``<think>...</think>`` chain of thought for every item.

    For each neutral/original sentence pair the prompt is built from the
    character's keywords and the sentence's pragmatic style labels, then sent
    to ``llm.ask`` with ``SHOTS_EXAMPLE`` as history. A reply without a
    usable think block is retried up to ``MAX_RETRIES`` more times; items that
    still fail are reported with ``print`` and left out of the result.

    Args:
        neutral_sentences: Items to process.
        temperature: Sampling temperature passed to ``llm.ask``.
        top_p: Nucleus-sampling value passed to ``llm.ask``.
        sleep_seconds: Pause after each item and before each retry.

    Returns:
        One dict per successful item with the keys "character", "original",
        "neutral" and "CoT".
    """
    records: list[Dict[str, str]] = []
    MAX_RETRIES = 2  # additional attempts after the first request

    for item in tqdm(neutral_sentences):
        character = item["character"]
        keywords = Lexical_All.get(character, [])
        pcfg_items = Syntactic_All.get(character, {})
        sentence_pragmatic_items = find_sentence_pragmatic_items(item["original"], pcfg_all_items)
        prompt = build_neutral_prompt(
            EN_NAME_TO_ZH.get(item["character"], item["character"]),
            item["neutral"],
            item["original"],
            keywords,
            pcfg_items,
            sentence_pragmatic_items,
        )

        thinking_process: str | None = None
        # Bounded retry: the earlier ``while`` loop never advanced its counter
        # and so never ended when the model kept returning malformed output.
        for attempt in range(MAX_RETRIES + 1):
            if attempt:
                # Keep the same request pacing between retries.
                sleep(sleep_seconds)

            result = llm.ask(
                prompt=prompt,
                history=SHOTS_EXAMPLE,
                temperature=temperature,
                top_p=top_p,
            ).strip()

            thinking_process = _extract_think_block(result)
            if thinking_process:
                break

        if not thinking_process:
            print(f"生成思维链失败: {item['original']}")
        else:
            records.append({
                "character": item["character"],
                "original": item["original"],
                "neutral": item["neutral"],
                "CoT": thinking_process
            })

        sleep(sleep_seconds)

    return records

In [9]:
# Batched, resumable CoT generation and saving.

OUTPUT_PATH = Path("./data/neutral_sentences_with_CoT.jsonl")
BATCH_SIZE = 200             # number of items processed per batch

def _load_done_keys(path: Path) -> set[tuple[str, str]]:
    """从已存在的 JSONL 中读取已完成项的键 (character, original)。"""
    done: set[tuple[str, str]] = set()
    if path.is_file():
        with path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                    done.add((obj["character"], obj["original"]))
                except Exception:
                    # 跳过坏行
                    continue
    return done


def _append_jsonl(path: Path, items: list[dict]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        for item in items:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


# Read progress: keys already saved, and the items that still need processing.
# NOTE(review): `total` counts items while `done_keys` holds unique
# (character, original) pairs, so the two figures can differ when the input
# contains duplicate pairs — confirm whether the progress output should
# account for that.
done_keys = _load_done_keys(OUTPUT_PATH)
total = len(neutral_items)
remaining_items = [it for it in neutral_items if (it["character"], it["original"]) not in done_keys]

print(f"Total: {total}, Done: {len(done_keys)}, Remaining: {len(remaining_items)}")

if not remaining_items:
    print("All items are already processed.")
else:
    start_done = len(done_keys)
    batches_run = 0

    # Process batch by batch.
    while remaining_items:
        batch = remaining_items[:BATCH_SIZE]
        remaining_items = remaining_items[BATCH_SIZE:]

        print(f"Processing batch {batches_run + 1}: {len(batch)} items...")
        records = generate_neutral_corpus_with_CoT(batch)

        # Append after every batch so an interrupted run can be resumed.
        _append_jsonl(OUTPUT_PATH, records)

        # Track the newly completed keys for the progress figures below.
        for r in records:
            done_keys.add((r["character"], r["original"]))

        print(f"Saved {len(records)} items. Progress: {len(done_keys)}/{total}")
        batches_run += 1

    print(f"Run finished. New processed: {len(done_keys) - start_done}. Remaining: {total - len(done_keys)}")

Total: 6596, Done: 6511, Remaining: 0
All items are already processed.
