# General Fine-tuning Model based on Qwen3-4B (Baseline C)

In [None]:
# Run this cell on the remote Jupyter notebook server.

import os

# Show the current working directory; the cells below use paths relative to it.
os.getcwd()
# Uncomment to change the working directory on the remote machine.
# os.chdir("/root/OtakuLab")

## Define Training Set Format

In [8]:
from typing_extensions import TypedDict
from typing import Optional
from pathlib import Path
import json
import re

class InstructionComponents(TypedDict):
    """Style-conditioning fields attached to a dataset item."""
    # Keywords for the item's character; set by DatasetStorage.save_characters_keywords.
    lexical_keywords: list[str]
    # Style labels for the item's sentence; set by DatasetStorage.save_component.
    pragmatic_styles: list[str]

class DatasetItem(TypedDict):
    """One training example: a neutral sentence paired with its styled version."""
    # Name of the character the styled sentence belongs to.
    character: str
    # Neutral version of the sentence; placed in the user prompt on export.
    neutral_sentence: str
    # Created as an empty dict and filled in later, hence the loose dict alternative.
    instruction_components: InstructionComponents | dict[str, list[str]|dict]
    # Styled sentence; used as the assistant turn on export and as the storage key.
    output: str

class DatasetStorage:
    """In-memory collection of training items keyed by their styled sentence.

    Items are registered with ``new_item``, enriched through
    ``save_characters_keywords`` and ``save_component``, and finally written
    out as ShareGPT-style JSON by ``output``.

    Because the styled sentence is the dictionary key, registering a second
    item with the same ``output`` replaces the earlier one.
    """

    def __init__(self) -> None:
        # Maps styled output sentence -> dataset item.
        self.items: dict[str, DatasetItem] = {}

    def __len__(self) -> int:
        """Return the number of stored items."""
        return len(self.items)

    def new_item(self, character: str, neutral_sentence: str, output: str) -> None:
        """Register an item whose instruction components are still empty.

        Args:
            character: Name of the character the styled sentence belongs to.
            neutral_sentence: Neutral version of the sentence.
            output: Styled sentence; also used as the storage key.
        """
        # A TypedDict is a plain dict at runtime, so a literal is equivalent
        # to calling the DatasetItem constructor.
        item: DatasetItem = {
            "character": character,
            "neutral_sentence": neutral_sentence,
            "instruction_components": {},
            "output": output,
        }
        self.items[output] = item

    def save_characters_keywords(self, character: str, lexical_keywords: list[str]) -> None:
        """Attach ``lexical_keywords`` to every item of ``character``.

        All matching items reference the same list object.

        Raises:
            ValueError: If no stored item belongs to ``character``.
        """
        matched = False
        for item in self.items.values():
            if item['character'] == character:
                item['instruction_components']['lexical_keywords'] = lexical_keywords
                matched = True
        if not matched:
            raise ValueError(f"角色 {character} 未找到")

    def save_component(self,
                       output: str,
                       pragmatic_styles: Optional[list[str]] = None) -> None:
        """Attach pragmatic styles to the item stored under ``output``.

        A ``None`` or empty ``pragmatic_styles`` leaves the item untouched.

        Raises:
            ValueError: If no item is stored under ``output``.
        """
        if output not in self.items:
            raise ValueError(f"风格句: {output} 似乎未加载")

        if pragmatic_styles:
            self.items[output]['instruction_components']['pragmatic_styles'] = pragmatic_styles

    @staticmethod
    def _verify_validity(item: "DatasetItem") -> bool:
        """Return True when every field needed for export is present and non-empty."""
        if not all((item['character'],
                    item['neutral_sentence'],
                    item['output'],
                    item['instruction_components'])):
            return False

        components = item['instruction_components']

        return all((components.get('lexical_keywords', None),
                    components.get('pragmatic_styles', None)))

    def output(self, output_path: Path) -> None:
        """Write all valid items to ``output_path`` as a ShareGPT-style JSON list.

        Items that fail ``_verify_validity`` are skipped. Missing parent
        directories of ``output_path`` are created. A summary line with the
        exported and skipped counts is printed.

        Args:
            output_path: Destination JSON file; overwritten if it exists.
        """
        items: list[DatasetItem] = list(self.items.values())
        valid_items = []

        for item in items:
            if not self._verify_validity(item):
                continue

            # _verify_validity guarantees both entries exist and are non-empty,
            # so no fallback value is needed here.
            instr = item["instruction_components"]
            keywords = ", ".join(instr["lexical_keywords"])
            pragmatic_styles = ", ".join(instr["pragmatic_styles"])
            # NOTE(review): the first field has no colon after "Target Character",
            # unlike the other fields - confirm the inference prompt matches.
            user_prompt = (
                f"Target Character {item['character']}\n"
                f"Personality: {pragmatic_styles}\n"
                f"Keywords: {keywords}\n"
                f"Neutral Content: {item['neutral_sentence']}\n"
            )

            # Build one ShareGPT-format conversation.
            valid_items.append({
                "conversations": [
                    {
                        "from": "human",
                        "value": user_prompt
                    },
                    {
                        "from": "gpt",
                        "value": item['output']
                    }
                ],
                "system": "You are a style transfer expert. Your task is to generate a new sentence that matches the target style, based on the content of a neutral sentence."
            })

        # Create the destination directory so a fresh checkout can export
        # without a manual mkdir.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(valid_items, f, ensure_ascii=False, indent=2)

        print(f"训练集已导出至 {output_path}, 总有效训练集数量: {len(valid_items)}, 跳过数量: {len(items) - len(valid_items)}")

# Shared storage instance that the following cells fill in and export.
dataset_storage = DatasetStorage()

## Load Neutral Sentences

In [9]:
# JSONL file passed to load_jsonl_file below.
neutral_sentences_jsonl_file = Path("./data/neutral_sentences_with_CoT.jsonl")

# Translates English character names to the Chinese names stored in the dataset;
# names missing from this map are kept unchanged by load_jsonl_file.
EN_NAME_TO_ZH = {"Muice": "沐雪", "Ayaka": "神里绫华", "Zhongli": "钟离", "Hutao": "胡桃", "Haruhi": "凉宫春日"}

class NSFileItem(TypedDict):
    """Shape of one JSON record in the neutral-sentence JSONL file."""
    # Character name; translated through EN_NAME_TO_ZH when loaded.
    character: str
    # Styled sentence; becomes the dataset item's output.
    original: str
    # Neutral sentence; becomes the dataset item's neutral_sentence.
    neutral: str
    # Not read by load_jsonl_file; presumably chain-of-thought text - TODO confirm.
    CoT: str

def load_jsonl_file(jsonl_file: Path):
    """Load neutral/styled sentence pairs from a JSONL file into ``dataset_storage``.

    Each non-blank line is parsed as one JSON record. The record's character
    name is translated through ``EN_NAME_TO_ZH`` (unknown names are kept
    as-is) and a new item is registered with the record's ``neutral`` and
    ``original`` sentences.
    """
    with open(jsonl_file, "r", encoding="utf-8") as handle:
        raw_lines = list(handle)

    for raw in raw_lines:
        stripped = raw.rstrip()
        if not stripped:
            # Skip blank lines.
            continue

        record: NSFileItem = json.loads(stripped)
        source_name = record["character"]
        zh_name = EN_NAME_TO_ZH.get(source_name, source_name)
        dataset_storage.new_item(zh_name, record["neutral"], record["original"])

# Fill the shared storage and report its size. The storage is keyed by the
# styled sentence, so the count is the number of distinct styled sentences.
load_jsonl_file(neutral_sentences_jsonl_file)
print(f"成功加载了 {len(dataset_storage)} 条训练集条目")

成功加载了 6510 条训练集条目


## Process Lexical Layer Vectors

In [10]:
def get_lexical_keywords_from_file(file: Path, top_n: int = 25) -> list[str]:
    """Return the first ``top_n`` keys of the JSON object stored in ``file``.

    Keys are returned in file order; no sorting by value is done here.
    """
    with open(file, "r", encoding="utf-8") as handle:
        scores: dict[str, float] = json.load(handle)

    keywords = [*scores.keys()]
    return keywords[:top_n]

# Read the keyword list for each character (default top_n of 25 keys per file).
Lexical_Muice = get_lexical_keywords_from_file(Path("./outputs/pmi/muice_pmi_filtered.json"))
Lexical_Ayaka = get_lexical_keywords_from_file(Path("./outputs/pmi/ayaka_pmi_filtered.json"))
Lexical_Zhongli = get_lexical_keywords_from_file(Path("./outputs/pmi/zhongli_pmi_filtered.json"))
Lexical_Hutao = get_lexical_keywords_from_file(Path("./outputs/pmi/hutao_pmi_filtered.json"))
Lexical_Haruhi = get_lexical_keywords_from_file(Path("./outputs/pmi/haruhi_pmi_filtered.json"))

# Attach each list to every stored item of the matching character; the names
# are the Chinese values of EN_NAME_TO_ZH. Raises ValueError if a character
# has no stored items.
dataset_storage.save_characters_keywords("沐雪", Lexical_Muice)
dataset_storage.save_characters_keywords("神里绫华", Lexical_Ayaka)
dataset_storage.save_characters_keywords("钟离", Lexical_Zhongli)
dataset_storage.save_characters_keywords("胡桃", Lexical_Hutao)
dataset_storage.save_characters_keywords("凉宫春日", Lexical_Haruhi)


## Process Pragmatic Style Layer Vectors

### Read Pragmatic Style Files for Each Corpus

In [11]:
# Per-character JSONL files holding pragmatic-style scores; each is read with
# read_pcfg_jsonl_file further down in this cell.
Pragmatic_Muice = "./outputs/pragmatic/muice.jsonl"
Pragmatic_ayaka = "./outputs/pragmatic/ayaka.jsonl"
Pragmatic_zhongli = "./outputs/pragmatic/zhongli.jsonl"
Pragmatic_hutao = "./outputs/pragmatic/hutao.jsonl"
Pragmatic_haruhi = "./outputs/pragmatic/haruhi.jsonl"

class RawPCFGItem(TypedDict):
    """Shape of one JSON record in a pragmatic-style JSONL file."""
    # Not read by read_pcfg_jsonl_file.
    prompt: str
    # Sentence the scores belong to; matched against stored output sentences.
    response: str
    # List of {style label: score} mappings, merged into one mapping when read.
    pragmatic_styles: list[dict[str, float]]

class PCFGItem(TypedDict):
    """A pragmatic-style record reduced to the labels that passed the threshold."""
    # Sentence the labels belong to.
    response: str
    # Labels whose score was strictly greater than the threshold.
    pragmatic_styles: list[str]

def read_pcfg_jsonl_file(jsonl_file: Path, threshold: Optional[float] = None) -> list[PCFGItem]:
    """Read a pragmatic-style JSONL file and keep the labels above ``threshold``.

    Every non-blank line is parsed as one JSON record. The record's list of
    ``{label: score}`` mappings is merged into a single mapping (later entries
    override earlier ones for the same label), and labels whose score is
    strictly greater than the threshold are kept in mapping order.

    Args:
        jsonl_file: Path of the JSONL file to read.
        threshold: Minimum score, exclusive. ``None`` (or any falsy value) is
            treated as 0.

    Returns:
        One ``PCFGItem`` per record, in file order.
    """
    with open(jsonl_file, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    # Parse everything first so malformed JSON surfaces before any record
    # is processed.
    records: list[RawPCFGItem] = [
        json.loads(text)
        for text in (raw.strip() for raw in raw_lines)
        if text
    ]

    cutoff = threshold or 0
    results: list[PCFGItem] = []

    for record in records:
        # list[dict[str, float]] -> dict[str, float]
        merged_scores: dict[str, float] = {}
        for score_map in record["pragmatic_styles"]:
            merged_scores.update(score_map)

        # dict[str, float] -> list[str]
        kept_labels = [label for label, score in merged_scores.items() if score > cutoff]

        results.append(PCFGItem(response=record["response"], pragmatic_styles=kept_labels))

    return results

# Read every character's file, keeping only labels with a score above 0.4.
pcfg_muice_items = read_pcfg_jsonl_file(Path(Pragmatic_Muice), 0.4)
pcfg_ayaka_items = read_pcfg_jsonl_file(Path(Pragmatic_ayaka), 0.4)
pcfg_zhongli_items = read_pcfg_jsonl_file(Path(Pragmatic_zhongli), 0.4)
pcfg_hutao_items = read_pcfg_jsonl_file(Path(Pragmatic_hutao), 0.4)
pcfg_haruhi_items = read_pcfg_jsonl_file(Path(Pragmatic_haruhi), 0.4)

pcfg_import_items = pcfg_muice_items + pcfg_ayaka_items + pcfg_zhongli_items + pcfg_hutao_items + pcfg_haruhi_items

# Attach the labels to the stored item whose output equals the response.
# save_component raises ValueError when no such item exists; those records
# are counted in `skiped` and otherwise ignored.
skiped = 0
for item in pcfg_import_items:
    try:
        dataset_storage.save_component(item["response"], pragmatic_styles=item["pragmatic_styles"])
    except ValueError as e:
        skiped += 1
        continue

# NOTE(review): the first number printed is the total number of records read,
# including the skipped ones, not the number actually updated - confirm intent.
print(f"更新了 {len(pcfg_import_items)} 条训练集条目的风格向量，已跳过: {skiped} 条")


更新了 7377 条训练集条目的风格向量，已跳过: 714 条


## Export Training Set File

In [13]:
# Destination of the exported ShareGPT-style training set.
OUTPUT_PATH = Path("./data/vanilla_llm_train.json")

# Write all valid items; items missing keywords or pragmatic styles are skipped.
dataset_storage.output(OUTPUT_PATH)

训练集已导出至 evaluate\dataset\vanilla_llm_train.json, 总有效训练集数量: 5786, 跳过数量: 724
