In [None]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from unsloth import FastLanguageModel
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig
from datetime import datetime
import re

# Maximum sequence length handed to FastLanguageModel.from_pretrained and SFTTrainer below.
max_seq_length=2048
# dtype forwarded to FastLanguageModel.from_pretrained; None is passed through unchanged
# (presumably lets the loader pick a dtype — confirm against the Unsloth docs).
dtype = None

In [None]:
model_name = "yourmodeldir"  # path to your local model directory (placeholder)
# Tokenizer used by the inference helpers (get_response / get_response_batch) defined below.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Load the locally stored model. The second return value of from_pretrained is discarded;
# the AutoTokenizer loaded in the previous cell is used instead.
model, _ = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    attn_implementation="flash_attention_2",
    # load_in_4bit=True
)
# Switch the model to Unsloth's inference mode before any generate() call.
FastLanguageModel.for_inference(model)
print(f"模型参数量: {model.num_parameters():,}")

In [None]:
## Weights & Biases setup.
## The token is read from the WANDB_API_KEY environment variable when it is set, so the
## secret does not have to be stored in the notebook. The "yourkey" placeholder is only a
## fallback and must be replaced with a real token (available from the W&B website) if the
## environment variable is not used.
import os

import wandb

wandb.login(key=os.environ.get("WANDB_API_KEY") or "yourkey")
run = wandb.init(
    project='ingratiation dsr18b',
    job_type="training",
    anonymous="allow"
)

In [None]:
# Repair JSON text so that the "课程名称" and "分类" keys are preceded by a comma
def fix_json_commas(json_string):
    """Insert missing commas before the known keys and drop a trailing comma before ``}``.

    A comma is added in front of ``"课程名称"`` / ``"分类"`` only when the nearest
    non-whitespace character before the key is not already a structural one
    (``,`` ``{`` ``[`` ``:``). Whitespace between an existing separator and the key is
    tolerated, and a key name that appears as a *value* (``"x":"分类"``) is left alone,
    so already-valid JSON is never corrupted.

    Args:
        json_string: Raw, possibly malformed JSON text.

    Returns:
        The repaired text (still a string; it is not parsed here).
    """
    for key in ("课程名称", "分类"):
        # group 1: last non-whitespace char before the key; group 2: optional whitespace + key
        pattern = r'([^,{\[:\s])(\s*"' + re.escape(key) + r'")'
        json_string = re.sub(pattern, r'\1,\2', json_string)
    # A comma directly before the closing brace is not valid JSON.
    json_string = re.sub(r',\s*}', '}', json_string)
    return json_string

In [None]:
# Single-prompt inference helper
@torch.no_grad()
def get_response(input_message):
    """Generate a completion for one prompt and return the text after the reasoning block.

    Args:
        input_message: Fully formatted prompt string.

    Returns:
        The decoded sequence with everything up to and including the last
        ``</think>`` marker removed and surrounding whitespace stripped. When the
        marker is absent the whole decoded sequence is returned.
    """
    inputs = tokenizer(input_message, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        repetition_penalty=1.2,
        use_cache=True,
        # Greedy decoding; no temperature is passed because it has no effect without sampling.
        do_sample=False,
    )
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response_text.split("</think>")[-1].strip()

## 测试一下

In [None]:
# Inference prompt template. It has a single positional "{}" placeholder that receives the
# question text; the doubled braces "{{" / "}}" are literal braces after str.format().
prompt_style = """你是教育学专家，正在对学习者在线课堂中的弹幕进行行为分类。
[分类规则]={{
    "观点遵从"：明确或间接地附和教师观点，表达对教师论点或行为的一致认同。
    "恭维他人"：直接或间接赞美教师、强调教师的重要性与优点。
    "自我展现"：突出自己的见解、经验或成就，以期引起教师注意。
    "施恩他人"：表达对教师或同学的帮助意愿，用行动或建议来支持他人。
}}
### 指令:
你需要对每个弹幕内容进行判断，课程名称包含场景信息，回答仅输出JSON，不在4类分类中请输出"无"
[输出格式]=
{{
    "分类": str分类
}}
### 问题:
{}请仅输出[输出格式]内容，不要生成多余信息
"""

In [None]:
# Sample question used to smoke-test get_response: a dict-style record followed by the request.
question = "{'弹幕内容': '是', '课程名称': '数据结构'}属于什么分类？"

In [None]:
# Fill the inference template with the sample question and print the model's answer.
res = get_response(prompt_style.format(question))
print(res)

In [None]:
def extract_json_from_text(text):
    """Return the first JSON object embedded in ``text`` that has a top-level "分类" key.

    Every ``{`` in the text is tried as the start of a JSON value using
    ``json.JSONDecoder.raw_decode``, so the object is found even when it is preceded by
    other braces, followed by extra text, or nested inside another object.

    Args:
        text: Arbitrary text that may contain a JSON object.

    Returns:
        The decoded dict, or None when no JSON object with a "分类" key is present
        (including for empty input).
    """
    if not text:
        return None

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None  # not valid JSON from this brace; try the next one
        if isinstance(candidate, dict) and "分类" in candidate:
            return candidate
        start = text.find("{", start + 1)

    return None  # no valid JSON object with the required key

In [None]:
# Batched inference helpers
def _clean_generation(response_text, question):
    """Normalise one decoded generation into text that is as close to JSON as possible."""
    answer = response_text.split("</think>")[-1].strip()
    cleaned = (
        answer.replace(question, '')
        .strip()
        .replace('```json', '')
        .replace('```', '')
        .replace(' ', '')
        .replace('\n', '')
        .replace('\\"', '"')
    )
    cleaned = re.sub("'", '"', cleaned)
    cleaned = re.sub(r':""(.*?)""', r':"\1"', cleaned)
    return fix_json_commas(cleaned)


def _parse_classification(cleaned_res):
    """Turn cleaned model text into a result dict, or an ``{"error": ...}`` dict on failure."""
    try:
        parsed = json.loads(cleaned_res)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, list):
        parsed = parsed[0] if parsed else {}
    if isinstance(parsed, dict):
        return parsed

    # Direct parsing failed or gave a non-object: look for an embedded JSON object instead.
    extracted_json = extract_json_from_text(cleaned_res)
    if extracted_json:
        return extracted_json

    failure = {"error": "无法解析 JSON，且未找到有效 JSON 结构"}
    print(failure)
    return failure


@torch.no_grad()
def get_response_batch(questions):
    """Generate and parse answers for a batch of prompts.

    Args:
        questions: List of fully formatted prompt strings.

    Returns:
        A list with one dict per prompt, in input order. Each dict is either the parsed
        JSON answer or ``{"error": <message>}`` when the output could not be parsed.
        An empty input list yields an empty list.
    """
    if not questions:
        return []

    # Batched generation appends new tokens after the last position, so the prompts are
    # padded on the left; the tokenizer's original setting is restored right after.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(questions, return_tensors="pt", padding=True).to("cuda")
    finally:
        tokenizer.padding_side = previous_padding_side

    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        repetition_penalty=1.2,
        use_cache=True,
        # Greedy decoding; no temperature is passed because it has no effect without sampling.
        do_sample=False,
    )
    # Decode only the newly generated tokens rather than stripping the prompt by string match.
    prompt_length = inputs["input_ids"].shape[1]
    decoded_outputs = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

    responses = []
    for question, response_text in zip(questions, decoded_outputs):
        try:
            cleaned_res = _clean_generation(response_text, question)
            print(f"cleaned_res={cleaned_res}")
            responses.append(_parse_classification(cleaned_res))
        except Exception as e:
            # Keep the batch aligned with its inputs even if one item fails unexpectedly.
            responses.append({"error": str(e)})
    return responses


In [None]:
# Load the train and test splits from their JSON files (UTF-8).
with open('Ingratiation/train_data0320.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open('Ingratiation/test_data0320.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

In [None]:
def process_data(data, data_type="train", BATCH_SIZE=8):
    """Classify a dataset in batches and save predictions and failures to Excel files.

    Args:
        data: List of dicts, each with a "Question" string and a "Response" dict that
            holds the ground-truth label under "分类".
        data_type: One of "train", "after_train", "after_test" (case-insensitive); any
            other value uses the "test" file names. Only affects output file naming.
        BATCH_SIZE: Number of items sent to get_response_batch per call (must be >= 1).

    Returns:
        ``(df_result, df_error)``: DataFrames of successful predictions
        (Question / 分类 / 原分类) and of failed items (Question / 原分类 / 错误信息).
        Either one is an empty DataFrame when there is nothing to report; the
        corresponding Excel file is only written when it is non-empty.

    Raises:
        ValueError: If BATCH_SIZE is smaller than 1.
    """
    if BATCH_SIZE < 1:
        raise ValueError("BATCH_SIZE must be >= 1")

    results = []
    error_logs = []

    for i in range(0, len(data), BATCH_SIZE):
        # Progress message (printed when the batch offset is a multiple of 256).
        if i % 256 == 0:
            print(f"第{i}个正在处理，当前类型：{data_type}")
        batch_data = data[i : i + BATCH_SIZE]
        questions_batch = [prompt_style.format(item["Question"]) for item in batch_data]

        batch_responses = get_response_batch(questions_batch)
        for item, response_data in zip(batch_data, batch_responses):
            original_response_gt = item["Response"]["分类"]

            # A response that is not a dict cannot carry the label; treat it as a missing
            # field instead of letting a TypeError abort the run and lose earlier results.
            if not isinstance(response_data, dict):
                response_data = {}

            if "error" in response_data or "分类" not in response_data:
                error_logs.append({
                    "Question": item["Question"],
                    "原分类": original_response_gt,
                    "错误信息": response_data.get("error", "响应缺少必要字段")
                })
                continue  # failed items are logged, not scored

            results.append({
                "Question": item["Question"],
                "分类": response_data["分类"],
                "原分类": original_response_gt
            })

    # Timestamp (minute resolution) used in the output file names.
    current_time = datetime.now().strftime("%Y%m%d%H%M")

    # Unknown data_type values fall back to the "test" naming.
    known_types = ("train", "after_train", "after_test")
    normalized_type = data_type.lower()
    file_tag = normalized_type if normalized_type in known_types else "test"
    result_file = f"df_{file_tag}_{current_time}.xlsx"
    error_file = f"error_{file_tag}_log_{current_time}.xlsx"

    df_result = pd.DataFrame()
    df_error = pd.DataFrame()

    if results:
        df_result = pd.DataFrame(results)
        df_result.to_excel(result_file, index=False)
        print(f"{data_type} 处理完成，结果已保存至 {result_file}")

    if error_logs:
        df_error = pd.DataFrame(error_logs)
        df_error.to_excel(error_file, index=False)
        print(f"{data_type} 处理完成，错误日志已保存至 {error_file}")
    return df_result,df_error


### 训练集模型分类情况

In [None]:
# Run the model (before fine-tuning) over the whole training set and time the run.
current_time1 = datetime.now()
df_train_result,df_train_error_result = process_data(train_data, data_type="train", BATCH_SIZE=128)
current_time2 = datetime.now()
# Elapsed wall-clock time in seconds.
print(f'用时{(current_time2-current_time1).total_seconds()}')

In [None]:
# Number of successfully parsed predictions vs. failed items on the training set.
print(len(df_train_result))
print(len(df_train_error_result))

## 测试集

In [None]:
# Same baseline evaluation on the test set.
df_test_result,df_test_error_result = process_data(test_data, data_type="test", BATCH_SIZE=128)

In [None]:
# Number of successfully parsed predictions vs. failed items on the test set.
print(len(df_test_result))
print(len(df_test_error_result))

## SFT

In [None]:
# Training prompt template. It has three positional "{}" placeholders, filled in order with
# the question, the chain-of-thought text and the final answer (see formatting_prompts_func);
# the doubled braces "{{" / "}}" are literal braces after str.format().
train_prompt_style = """你是教育学专家，正在对学习者在线课堂中的弹幕进行行为分类。
[分类规则]={{
    "观点遵从"：明确或间接地附和教师观点，表达对教师论点或行为的一致认同。
    "恭维他人"：直接或间接赞美教师、强调教师的重要性与优点。
    "自我展现"：突出自己的见解、经验或成就，以期引起教师注意。
    "施恩他人"：表达对教师或同学的帮助意愿，用行动或建议来支持他人。
}}
### 指令:
你需要对每个弹幕内容进行判断，课程名称包含场景信息，回答仅输出JSON格式，不在4类分类中请输出"无"
[输出格式]=
{{
    "分类": str分类
}}
### 问题:
{}你的回答内容必须是合法 JSON 结构，不可包含多余说明
### 思考:
{}
### 回答：
{}"""

In [None]:
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
from transformers import PreTrainedTokenizerFast


def formatting_prompts_func(examples, max_length=1024):
    """Build SFT rows (text, token ids, attention mask, loss labels) from a batch.

    Args:
        examples: Batched mapping with "Question", "Complex_CoT" and "Response" columns
            (one list per column), as supplied by ``Dataset.map(..., batched=True)``.
        max_length: Truncation length for tokenisation and fixed length of each label row.

    Returns:
        Dict of lists with keys "text", "input_ids", "attention_mask" and "labels".
        "input_ids" / "attention_mask" are truncated but not padded. Each "labels" row
        has length ``max_length``: positions of the answer tokens (answer + EOS) carry
        the corresponding input ids, every other position is -100.
    """
    texts = []
    input_ids_batch = []
    attention_mask_batch = []
    labels = []

    for question, cot, output in zip(
        examples["Question"], examples["Complex_CoT"], examples["Response"]
    ):
        # The prompt demands valid JSON, so a dict/list target is serialised as JSON
        # rather than as a Python repr with single quotes.
        if isinstance(output, (dict, list)):
            answer_text = json.dumps(output, ensure_ascii=False)
        else:
            answer_text = str(output)
        supervised_text = answer_text + EOS_TOKEN

        # The template ends with the answer placeholder, so the full text always ends
        # with `supervised_text`.
        full_text = train_prompt_style.format(question, cot, answer_text) + EOS_TOKEN
        texts.append(full_text)

        # Tokenise once and reuse the result for ids, mask and labels.
        encoded = tokenizer(full_text, truncation=True, max_length=max_length)
        token_ids = list(encoded["input_ids"])
        input_ids_batch.append(token_ids)
        attention_mask_batch.append(list(encoded["attention_mask"]))

        # Number of tokens before the answer, measured on the unpadded prompt prefix.
        prompt_text = full_text[: len(full_text) - len(supervised_text)]
        prompt_ids = tokenizer(prompt_text, truncation=True, max_length=max_length)["input_ids"]
        prompt_len = min(len(prompt_ids), len(token_ids))

        # Loss only on the answer tokens; labels[i] lines up with input_ids[i].
        # NOTE(review): the original comments mention "CoT + Response" as the supervised
        # span, but the original code tokenised the response only — confirm intent.
        label_ids = [-100] * prompt_len + token_ids[prompt_len:]
        # NOTE(review): rows are padded to max_length to keep the original label length,
        # while input_ids stay unpadded — confirm against the data collator in use.
        label_ids = label_ids + [-100] * (max_length - len(label_ids))
        labels.append(label_ids)

    return {
        "text": texts,
        "input_ids": input_ids_batch,
        "attention_mask": attention_mask_batch,
        "labels": labels,
    }


In [None]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
# 1) Load the training JSON file as a Hugging Face dataset.
dataset = load_dataset(
    "json",
    data_files="Ingratiation/train_data0320.json",  # file path given directly
    split="train",
)
# 2) Convert to pandas so sklearn can split it.
df = pd.DataFrame(dataset)

# 3) Extract the class label from each Response
# -------------------------------
df["label"] = df["Response"].apply(lambda x: x["分类"])

# 4) Stratified split with sklearn
train_df, eval_df = train_test_split(
    df,
    test_size=0.1,       # 10% held out for validation
    random_state=42,
    stratify=df["label"] # stratify on the label column
)

# -------------------------------
# 5) Convert back to Hugging Face Datasets
# -------------------------------
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

# -------------------------------
# 6) Apply the prompt formatting to both splits
# -------------------------------
train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
eval_dataset = eval_dataset.map(formatting_prompts_func, batched=True)

# eval_dataset[0]


In [None]:
# Drop the inference model before loading a fresh copy for training. `del` only removes
# the name, so also run the garbage collector and release PyTorch's cached GPU memory.
import gc

del model
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload the model for training. This rebinds the global `tokenizer` to the one returned by
# Unsloth (replacing the AutoTokenizer loaded at the top of the notebook).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    attn_implementation="flash_attention_2",
)

In [None]:
# Put the model into Unsloth's training mode, then wrap it with LoRA adapters.
FastLanguageModel.for_training(model)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    # Modules that receive adapters: the attention and MLP projection layers listed here.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth", 
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning setup: 300 optimisation steps with evaluation and checkpointing
# every 10 steps; the checkpoint with the lowest eval_loss is restored at the end.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # training split
    eval_dataset=eval_dataset,    # validation split
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,  # batch size used for evaluation
        gradient_accumulation_steps=1,
        warmup_steps=10,
        max_steps=300,
        learning_rate=5e-5,
        optim="adamw_hf",
        weight_decay=0.1,
        lr_scheduler_type="cosine",
        # Use bf16 when the hardware supports it, otherwise fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        tf32=True,
        gradient_checkpointing=True,
        logging_steps=10,
        evaluation_strategy="steps",  # evaluate on a step schedule
        eval_steps=10,               # evaluate every 10 steps
        save_strategy="steps",       # checkpoint on a step schedule
        save_steps=10,               # save a checkpoint every 10 steps
        save_total_limit=2,          # keep at most 2 checkpoints
        load_best_model_at_end=True, # reload the best checkpoint when training ends
        metric_for_best_model="eval_loss",  # best model is chosen by validation loss
        greater_is_better=False,    # lower loss is better
        seed=42,
        output_dir="outputs_ingratiation",
    ),
)

In [None]:
# Start training; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

In [None]:
# Close the Weights & Biases run opened by wandb.init() earlier in the notebook.
wandb.finish()

### 本地保存模型

In [None]:
# Directory that receives the fine-tuned model and its tokenizer.
new_model_local = "DeepSeek-R1-Ingratiation-COT-0317"
model.save_pretrained(new_model_local) # Local saving
tokenizer.save_pretrained(new_model_local)

## 运行本地保存的模型

In [None]:
# Reload the saved fine-tuned model for inference. This rebinds the globals `model_name`,
# `tokenizer` and `model` that get_response / get_response_batch read at call time.
model_name = "DeepSeek-R1-Ingratiation-COT-0317"  # path to your local model directory
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, _ = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype = dtype,
    attn_implementation="flash_attention_2",
)
# Switch to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

## 训练后模型表现

In [None]:
# Batch-process the training set with the fine-tuned model and time the run.
# Only the first 256 training items are evaluated here (train_data[:256]).
current_time1 = datetime.now()
df_train_after_SFT_result,df_train_after_SFT_error = process_data(train_data[:256], data_type="after_train", BATCH_SIZE=128)
current_time2 = datetime.now()
# Elapsed wall-clock time in seconds.
print(f'用时{(current_time2-current_time1).total_seconds()}')

In [None]:
# Number of successfully parsed predictions vs. failed items after fine-tuning (training subset).
print(len(df_train_after_SFT_result))
print(len(df_train_after_SFT_error))

In [None]:
# Evaluate the fine-tuned model on the full test set.
df_test_after_SFT_result,df_test_after_SFT_error = process_data(test_data, data_type="after_test", BATCH_SIZE=128)

In [None]:
# Number of successfully parsed predictions vs. failed items after fine-tuning (test set).
print(len(df_test_after_SFT_result))
print(len(df_test_after_SFT_error))

## 训练后训练集测试集分类情况

In [None]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    cohen_kappa_score
)

In [None]:
# Evaluation helper
def evaluate_model(true_labels, pred_labels):
    """Score predictions against ground truth.

    Args:
        true_labels: Ground-truth class labels.
        pred_labels: Predicted class labels, aligned with ``true_labels``.

    Returns:
        Dict mapping metric name to its value rounded to 4 decimals: accuracy,
        macro-averaged precision / recall / F1, and Cohen's kappa.
    """
    macro_scorers = (
        ('Precision(macro)', precision_score),
        ('Recall(macro)', recall_score),
        ('F1(macro)', f1_score),
    )

    report = {'Accuracy': round(accuracy_score(true_labels, pred_labels), 4)}
    for metric_name, scorer in macro_scorers:
        report[metric_name] = round(scorer(true_labels, pred_labels, average='macro'), 4)
    report['Cohen Kappa'] = round(cohen_kappa_score(true_labels, pred_labels), 4)
    return report

# Result files to evaluate, keyed by a description (before/after fine-tuning x train/test).
# The file names are placeholders and must point at the Excel files written by process_data.
file_map = {
    '微调前-训练集': 'yourfile_beforeSFT_train.xlsx',
    '微调前-测试集': 'yourfile_beforeSFT_test.xlsx',
    '微调后-训练集': 'yourfile_afterSFT_train.xlsx',
    '微调后-测试集': 'yourfile_afterSFT_test.xlsx'
}

# Metrics per dataset description. Note: this rebinds the global names `results` and `df`.
results = {}

# Score every result file
for desc, filename in file_map.items():
    df = pd.read_excel(filename, engine='openpyxl')
    
    # Make sure the ground-truth ('原分类') and prediction ('分类') columns are present
    if '原分类' not in df.columns or '分类' not in df.columns:
        raise ValueError(f"{filename} 中缺少必要的列 '原分类' 或 '分类'")
    
    # Compute the metrics
    metrics = evaluate_model(df['原分类'], df['分类'])
    results[desc] = metrics

# Convert to a DataFrame (one row per dataset) for easier comparison
results_df = pd.DataFrame(results).T

# Print the full results
print("完整评估结果：")
print(results_df)

# Before/after comparison
print("\n对比分析（微调前后差异）：")

# Training set: after minus before
train_diff = results_df.loc['微调后-训练集'] - results_df.loc['微调前-训练集']
train_diff.name = '训练集差异'

# Test set: after minus before
test_diff = results_df.loc['微调后-测试集'] - results_df.loc['微调前-测试集']
test_diff.name = '测试集差异'

# Combine both difference rows into one table
comparison = pd.concat([train_diff, test_diff], axis=1).T
print(comparison)