# Per-Character Training Set Construction and Generation

In [None]:
import os

# Show the current working directory: the data paths used further down in this
# notebook are relative and are resolved against it.
os.getcwd()

# Uncomment and pass the project root here if the notebook was started elsewhere.
# os.chdir()

## Training Set Construction

In [None]:
import json
import os

# Build one ShareGPT-style training file per character from the combined JSONL
# file, plus a dataset_info.json that registers every generated file.

# Paths, relative to the current working directory (expected: OtakuLab).
input_file = "data/neutral_sentences_with_CoT.jsonl"
output_dir = "data/per_character_train"
dataset_info_path = os.path.join(output_dir, "dataset_info.json")

# English character name -> Chinese name inserted into the system prompt.
EN_NAME_TO_ZH = {
    "Muice": "沐雪",
    "Ayaka": "神里绫华",
    "Zhongli": "钟离",
    "Hutao": "胡桃",
    "Haruhi": "凉宫春日",
}

# Make sure the output directory exists.
os.makedirs(output_dir, exist_ok=True)

# Read the records and group the converted samples by character.
character_data = {}

with open(input_file, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        # Blank lines (typically a trailing newline at the end of the file)
        # are not valid JSON documents; skip them instead of crashing.
        if not line.strip():
            continue
        try:
            item = json.loads(line)
        except json.JSONDecodeError as err:
            # JSONDecodeError is a ValueError subclass; re-raise with the
            # file position so the offending record can be located.
            raise ValueError(
                f"{input_file}:{line_no}: invalid JSON ({err})"
            ) from err

        char = item['character']

        # Chinese display name; fall back to the raw name when unmapped.
        char_zh = EN_NAME_TO_ZH.get(char, char)

        # Field mapping from the source record.
        neutral_sentence = item['neutral']
        thinking_process = item['CoT']
        output_sentence = item['original']

        # ShareGPT-format sample: the human turn carries the neutral sentence,
        # the gpt turn carries the reasoning followed by the styled sentence.
        conversation = {
            "conversations": [
                {
                    "from": "human",
                    "value": f"Neutral Content: {neutral_sentence}"
                },
                {
                    "from": "gpt",
                    "value": f"{thinking_process}\n\n{output_sentence}"
                }
            ],
            "system": f"You are a style transfer expert. Your task is to mimic the personality of {char_zh} and generate a new sentence that matches the (s)he style, based on the content of a neutral sentence."
        }

        character_data.setdefault(char, []).append(conversation)

# Resolve a file-system-safe name for every character BEFORE writing anything,
# so that an unusable or clashing name cannot leave a half-written data set or
# silently overwrite another character's file.
safe_names = {}
name_owner = {}
for char in character_data:
    # Keep only alphanumerics, spaces, underscores and hyphens.
    safe_char_name = "".join(
        c for c in char if c.isalnum() or c in (' ', '_', '-')
    ).strip()
    if not safe_char_name:
        raise ValueError(
            f"character name {char!r} has no characters usable in a file name"
        )
    if safe_char_name in name_owner:
        raise ValueError(
            f"character names {name_owner[safe_char_name]!r} and {char!r} "
            f"both map to the file name prefix {safe_char_name!r}"
        )
    name_owner[safe_char_name] = char
    safe_names[char] = safe_char_name

# Write each character's data set and collect its dataset_info.json entry.
dataset_info = {}

for char, data in character_data.items():
    safe_char_name = safe_names[char]
    file_name = f"{safe_char_name}_train.json"
    file_path = os.path.join(output_dir, file_name)

    # Save this character's samples.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    # Register the file under "<name>_style_train".
    dataset_info[f"{safe_char_name}_style_train"] = {
        "file_name": file_name,
        "formatting": "sharegpt",
        "columns": {
            "messages": "conversations",
            "system": "system",
        }
    }

# Save dataset_info.json next to the generated data files.
with open(dataset_info_path, 'w', encoding='utf-8') as f:
    json.dump(dataset_info, f, ensure_ascii=False, indent=2)

print(f"处理完成！共处理 {len(character_data)} 个角色。")
print(f"数据集已保存至: {os.path.abspath(output_dir)}")
print(f"配置文件已保存至: {os.path.abspath(dataset_info_path)}")

In [4]:
# Print one `llamafactory-cli train` command per character, choosing the
# hyper-parameter scheme (A / B / C) from the number of training samples.
# Relies on `character_data` built by the training-set construction cell.
import shlex

print("### Generated Training Commands ###\n")

for char, data in character_data.items():
    count = len(data)
    # Same sanitisation as used when the data set files were written.
    safe_char_name = "".join([c for c in char if c.isalnum() or c in (' ', '_', '-')]).strip()
    dataset_name = f"{safe_char_name}_style_train"
    # Deliberately NOT named `output_dir`: that notebook-level variable holds
    # the data set directory and must not be overwritten by this cell.
    train_output_dir = f"outputs/Per-Character/{safe_char_name}"

    # The sanitised name may contain spaces; quote the values for the shell so
    # each stays a single argument. Names without special characters are
    # emitted unchanged.
    dataset_arg = shlex.quote(dataset_name)
    output_dir_arg = shlex.quote(train_output_dir)

    # Default parameters (scheme A, used for 1500 samples or more).
    params = {
        "per_device_train_batch_size": 4,
        "gradient_accumulation_steps": 8,
        "learning_rate": "6e-4",
        "warmup_ratio": 0.03,
        "num_train_epochs": 5,
        "weight_decay": 0.01,
        "lora_dropout": 0.05
    }

    scheme = "A"

    # Adjust the parameters for smaller data sets.
    if count < 600:
        # Scheme C: fewer than 600 samples.
        scheme = "C"
        params.update({
            "per_device_train_batch_size": 2,
            "gradient_accumulation_steps": 16,
            "learning_rate": "2e-4",
            "warmup_ratio": 0.1,
            "num_train_epochs": 8,
            "weight_decay": 0.02,
            "lora_dropout": 0.1
        })
    elif count < 1500:
        # Scheme B: 600 to 1499 samples.
        scheme = "B"
        params.update({
            "per_device_train_batch_size": 2,
            "gradient_accumulation_steps": 16,
            "learning_rate": "4e-4",
            "warmup_ratio": 0.05,
            "num_train_epochs": 5,
            "weight_decay": 0.01,
            "lora_dropout": 0.05
        })

    print(f"# Character: {char} | Count: {count} | Scheme: {scheme}")

    # NOTE(review): the construction cell writes dataset_info.json into
    # data/per_character_train, but no --dataset_dir is passed here; confirm
    # the trainer is pointed at that directory when these commands are run.
    command = f"""llamafactory-cli train \\
    --model_name_or_path /root/autodl-tmp/Qwen3-4B/ \\
    --template qwen3 \\
    --stage sft \\
    --do_train \\
    --finetuning_type lora \\
    --dataset {dataset_arg} \\
    --lora_rank 32 \\
    --lora_alpha 16 \\
    --lora_dropout {params['lora_dropout']} \\
    --max_samples 10000 \\
    --overwrite_cache \\
    --per_device_train_batch_size {params['per_device_train_batch_size']} \\
    --gradient_accumulation_steps {params['gradient_accumulation_steps']} \\
    --learning_rate {params['learning_rate']} \\
    --weight_decay {params['weight_decay']} \\
    --max_grad_norm 1.0 \\
    --num_train_epochs {params['num_train_epochs']} \\
    --lr_scheduler_type cosine \\
    --warmup_ratio {params['warmup_ratio']} \\
    --logging_steps 10 \\
    --save_steps 300 \\
    --plot_loss \\
    --output_dir {output_dir_arg} \\
    --overwrite_output_dir \\
    --report_to none"""

    print(command)
    print("\n" + "="*50 + "\n")

### Generated Training Commands ###

# Character: Muice | Count: 2704 | Scheme: A
llamafactory-cli train \
    --model_name_or_path /root/autodl-tmp/Qwen3-4B/ \
    --template qwen3 \
    --stage sft \
    --do_train \
    --finetuning_type lora \
    --dataset Muice_style_train \
    --lora_rank 32 \
    --lora_alpha 16 \
    --lora_dropout 0.05 \
    --max_samples 10000 \
    --overwrite_cache \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --learning_rate 6e-4 \
    --weight_decay 0.01 \
    --max_grad_norm 1.0 \
    --num_train_epochs 5 \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.03 \
    --logging_steps 10 \
    --save_steps 300 \
    --plot_loss \
    --output_dir outputs/Per-Character/Muice \
    --overwrite_output_dir \
    --report_to none


# Character: Ayaka | Count: 1416 | Scheme: B
llamafactory-cli train \
    --model_name_or_path /root/autodl-tmp/Qwen3-4B/ \
    --template qwen3 \
    --stage sft \
    --do_train \
    --finet