# 1. 加载数据

In [1]:
# 使用transformers库，用datasets加载数据并配合pipeline，通过self-consistency（多次采样投票）进行推理
import datasets

import os
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd
import re
import json
import torch
import dotenv
from transformers.pipelines.pt_utils import KeyDataset
dotenv.load_dotenv()

def read_texts_from_dir(dir_path):
    """
    Reads the texts from a given directory and saves them in the pd.DataFrame
    indexed by 'id' with columns ['file_1', 'file_2'].

    Every immediate sub-directory of `dir_path` is expected to contain
    `file_1.txt` and `file_2.txt`. The sample id is parsed from the last four
    characters of the sub-directory name. Sub-directories that cannot be read
    (missing file, undecodable text, non-numeric suffix) are reported and
    skipped, so the result only ever contains complete rows.

    Params:
    dir_path (str): path to the directory with data

    Returns:
    pd.DataFrame: one row per successfully read sub-directory, sorted by
        sub-directory name; empty (but with the right columns) if none was read.
    """
    # Only immediate sub-directories are samples; nested folders and stray files are ignored.
    folder_names = sorted(
        name for name in os.listdir(dir_path)
        if os.path.isdir(os.path.join(dir_path, name))
    )
    print(f"Number of directories: {len(folder_names)}")

    # Rows are appended one by one, so a failed folder never leaves a placeholder behind.
    rows = []
    for folder_name in folder_names:
        folder_path = os.path.join(dir_path, folder_name)
        try:
            with open(os.path.join(folder_path, 'file_1.txt'), 'r', encoding='utf-8') as f1:
                text1 = f1.read().strip()
            with open(os.path.join(folder_path, 'file_2.txt'), 'r', encoding='utf-8') as f2:
                text2 = f2.read().strip()
            index = int(folder_name[-4:])
            rows.append((index, text1, text2))
        except Exception as e:
            # Best effort: report the broken sample and continue with the rest.
            print(f"Error reading directory {folder_name}: {e}")

    # Change list with results into pandas DataFrame
    df = pd.DataFrame(rows, columns=['id', 'file_1', 'file_2']).set_index('id')
    return df

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locate the test split under <ROOT_DATA_DIR>/fake-or-real-the-impostor-hunt/test and load it.
# NOTE(review): os.environ.get returns None when ROOT_DATA_DIR is unset, in which case
# os.path.join below raises TypeError — confirm the variable is always defined.
data_dir = os.environ.get("ROOT_DATA_DIR")
data_name = "fake-or-real-the-impostor-hunt"
test_path=os.path.join(data_dir, data_name, "test")
df_test=read_texts_from_dir(test_path)

Number of directories: 1068


In [3]:
# Preview the first rows of the loaded test pairs.
df_test.head()

Unnamed: 0_level_0,file_1,file_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"""Music"" Music music music Music music Music mu...",Since its launch on Paranal observatory's Very...
1,underground exploration on SN's birth has prov...,SN 1987A provides valuable insights as newer o...
2,This research aimed to understand how star sha...,ChromeDriver music player\nThis study focused ...
3,Using OmegaCAM's wide field capabilities spann...,"greek translation :\nvazhi (megaCAM), territor..."
4,AssemblyCulture AssemblyCulture AssemblyCultur...,XClass is software tool that helps astronomers...


# 2. 加载模型

In [4]:
# Model path: <ROOT_MODEL_DIR>/Qwen/Qwen3-8B
# NOTE(review): os.environ.get returns None when ROOT_MODEL_DIR is unset, which makes
# os.path.join raise TypeError — confirm the variable is always defined.
model_dir = os.environ.get("ROOT_MODEL_DIR")
model_name = "Qwen/Qwen3-8B"
model_path = os.path.join(model_dir, model_name)

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
    device_map="cuda:3"  # NOTE(review): hard-coded GPU index — adjust for the target machine
)

Loading checkpoint shards: 100%|██████████| 5/5 [00:22<00:00,  4.52s/it]


# 3. 数据预处理

In [5]:
# System prompt — standardizes the output format.
# The string is not raw, so each "\\boxed" below reaches the model as "\boxed"
# (a single backslash), which is what the answer-extraction regex looks for.
system_prompt = """You are tasked with judging the authenticity of two text samples. Given two texts labeled 1 and 2, one is real and the other is fake. Please think step by step, but only keep a minimum draft for each thinking step, with 5 words at most.

To ensure consistent evaluation, please follow this strict output format:
1. First, you may analyze and reason through the samples (this part will not be evaluated)
2. For your final answer, you must wrap the number of the real sample (only 1 or 2) within \\boxed{}
3. Do not include any other content outside the \\boxed{} tag
4. Examples of correct format: \\boxed{1} or \\boxed{2}
5. Any output not following this format will be considered invalid"""

def create_prompt(sample1, sample2):
    """Build the user prompt that embeds both samples and restates the output format.

    Both placeholders are filled in one pass over the template, so
    placeholder-looking text inside a sample (e.g. a literal "[[sample2]]"
    inside `sample1`) is kept verbatim instead of being overwritten by the
    other sample.

    Params:
    sample1 (str): text shown as "Sample 1"
    sample2 (str): text shown as "Sample 2"

    Returns:
    str: the complete user prompt
    """
    template = """Below are two text samples. Determine which is real and which is fake.

Sample 1: [[sample1]]

Sample 2: [[sample2]]

Please follow these steps:
1. You may first compare their content, logical consistency, information accuracy, and language style
2. Then, provide your final answer by putting only the number of the real sample (1 or 2) within \\boxed{}
3. Ensure there is no other text outside the \\boxed{} tag

Example of correct output:
After comparing the two samples, I think Sample 1 is more consistent and accurate. Answer: \\boxed{1}

Example of incorrect output:
2 (This is missing the required boxed format)"""
    replacements = {"[[sample1]]": sample1, "[[sample2]]": sample2}
    # A callable replacement is inserted literally: backslashes in the samples are
    # not interpreted, and already-inserted text is never scanned again.
    return re.sub(r"\[\[sample[12]\]\]", lambda m: replacements[m.group(0)], template)

def process_sample(sample1, sample2):
    """Render one pair of samples as a chat-formatted prompt string.

    Each sample is cut to its first 4096 characters before it is placed in
    the user message. The system and user messages are then passed through
    the tokenizer's chat template.

    Returns:
    str: the templated prompt text (not token ids)
    """
    user_text = create_prompt(sample1[:4096], sample2[:4096])
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_text),
    ]
    # tokenize=False keeps the result as text; the generation prompt is appended
    # and the template is rendered with thinking enabled.
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,
    )

In [6]:
# Convert the DataFrame to a datasets.Dataset
data = datasets.Dataset.from_pandas(df_test)
# Show the first record as a sanity check
data[0]

{'file_1': '"Music" Music music music Music music Music music Music music\nThe two telescopes using "Music" have been incredibly busy since their launch! They\'re incredibly popular for research on Earthly objects like star clusters or celestial bodies like planets or even galaxies far away! They\'ve produced many scientific publications within just a few years - so many so that they dominate by far when it comes to peer reviewed articles published from those telescopes! These musical journey has produced over a hundred articles published through various outlets like journals such as \'Nature\'. Some notable achievements include discovering near space objects found between star systems as well as identifying cosmic events such as how gravity affects light waves traveling through spacemusic\nBeyond its contributions to individual research projects with specific scientific goals (like finding new types or patterns) , it also contributes significantly to our overall understanding by being

In [7]:
# Build one chat prompt per row; the raw text columns are dropped from the result.
processed_data = data.map(lambda x: {"prompt": process_sample(x["file_1"], x["file_2"])}, remove_columns=["file_1", "file_2"])
# Display the prompts
processed_data["prompt"]

Map: 100%|██████████| 1068/1068 [00:00<00:00, 3719.83 examples/s]


Column(['<|im_start|>system\nYou are tasked with judging the authenticity of two text samples. Given two texts labeled 1 and 2, one is real and the other is fake. Please think step by step, but only keep a minimum draft for each thinking step, with 5 words at most.\n\nTo ensure consistent evaluation, please follow this strict output format:\n1. First, you may analyze and reason through the samples (this part will not be evaluated)\n2. For your final answer, you must wrap the number of the real sample (only 1 or 2) within \\boxed{}\n3. Do not include any other content outside the \\boxed{} tag\n4. Examples of correct format: \\boxed{1} or \\boxed{2}\n5. Any output not following this format will be considered invalid<|im_end|>\n<|im_start|>user\nBelow are two text samples. Determine which is real and which is fake.\n\nSample 1: "Music" Music music music Music music Music music Music music\nThe two telescopes using "Music" have been incredibly busy since their launch! They\'re incredibly 

# 4. 推理与保存

In [8]:
# Select the sampling parameters that match the thinking mode
def get_sampling_parameters(enable_thinking=True):
    """Return the generation sampling kwargs for the given mode.

    Params:
    enable_thinking (bool): truthy selects the thinking-mode values
        (temperature 0.6, top_p 0.95); falsy selects the non-thinking values
        (temperature 0.7, top_p 0.8).

    Returns:
    dict: a new dict with keys temperature, top_p, top_k, min_p, do_sample
    """
    # Only temperature and top_p differ between the two modes.
    temperature, top_p = (0.6, 0.95) if enable_thinking else (0.7, 0.8)
    return {
        "temperature": temperature,
        "top_p": top_p,
        "top_k": 20,
        "min_p": 0.0,
        "do_sample": True,
    }

# Initialize the text-generation pipeline, reusing the model and tokenizer objects loaded earlier.
# NOTE(review): `model` is an already-instantiated object that was placed on "cuda:3" at load
# time, and the cell output reports "Device set to use cuda:3"; device_map="auto" and
# torch_dtype here therefore look redundant — confirm against the transformers version in use.
text_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Device set to use cuda:3


In [9]:
# Answer extraction: pull the boxed 1 / 2 verdict out of the generated text
def extract_answer(generated_text):
    """Extract the model's verdict ('1' or '2') from generated text.

    Lookup order:
    1. the LAST "\\boxed{1}" / "\\boxed{2}" in the text. The final answer comes
       after the reasoning, and the reasoning may contain earlier draft or
       example boxes, so the last occurrence is the one that counts;
    2. otherwise the last bare "{1}" / "{2}";
    3. otherwise, if exactly one of the digits 1 / 2 occurs in the last 25
       characters, that digit;
    4. otherwise a warning is printed and '1' is returned.

    Params:
    generated_text (str): text produced by the model

    Returns:
    str: '1' or '2' (never None; unparseable text falls back to '1')
    """
    boxed = re.findall(r'\\boxed\{\s*([12])\s*\}', generated_text)
    if boxed:
        return boxed[-1]

    braced = re.findall(r'\{([12])\}', generated_text)
    if braced:
        return braced[-1]

    # Last resort: a lone digit near the end of the text.
    tail = generated_text[-25:]
    has_1 = "1" in tail
    has_2 = "2" in tail
    if has_1 != has_2:
        return '1' if has_1 else '2'

    print(f"警告: 无效的判断结果 - {generated_text[-50:]}...，默认返回1")
    return '1'

def get_consensus_answer(generated_texts, default="1"):
    """Majority-vote the answers extracted from several generations.

    Params:
    generated_texts: iterable of dicts, each with a "generated_text" entry
    default (str): value returned when the vote is tied or there is nothing to count

    Returns:
    str: "1" or "2" for a strict majority, otherwise `default`
    """
    votes = [extract_answer(item["generated_text"]) for item in generated_texts]

    ones = sum(1 for vote in votes if vote == "1")
    twos = sum(1 for vote in votes if vote == "2")

    # A tie (which includes the empty case, 0 vs 0) falls back to the default.
    if ones == twos:
        return default
    return "1" if ones > twos else "2"

In [10]:
# Load the results already written to the output CSV so inference can resume after them.
# NOTE(review): raises FileNotFoundError when the CSV does not exist yet (first run) —
# confirm this cell is only meant to be executed when resuming.
pred_data = pd.read_csv("data/self_consistent_results.csv")


In [11]:
# Value in the first column of the last saved row; presumably the most recently written id
# (the CSV is created with columns ['id', 'real_text_id']).
# NOTE(review): raises IndexError if the CSV contains only the header row.
idx = pred_data.iloc[-1, 0]
print(idx)

945


In [12]:
# Keep only the rows after idx (idx itself is excluded).
# NOTE(review): idx is read from the id column, but select() takes row positions, so this
# presumably relies on ids being 0..N-1 in row order — confirm. The cell also overwrites
# processed_data, so running it twice slices the already-sliced dataset again.
processed_data = processed_data.select(range(idx+1, len(processed_data)))

In [13]:
# Show the first remaining record to check where inference will resume.
processed_data[0]

{'id': 946,
 'prompt': '<|im_start|>system\nYou are tasked with judging the authenticity of two text samples. Given two texts labeled 1 and 2, one is real and the other is fake. Please think step by step, but only keep a minimum draft for each thinking step, with 5 words at most.\n\nTo ensure consistent evaluation, please follow this strict output format:\n1. First, you may analyze and reason through the samples (this part will not be evaluated)\n2. For your final answer, you must wrap the number of the real sample (only 1 or 2) within \\boxed{}\n3. Do not include any other content outside the \\boxed{} tag\n4. Examples of correct format: \\boxed{1} or \\boxed{2}\n5. Any output not following this format will be considered invalid<|im_end|>\n<|im_start|>user\nBelow are two text samples. Determine which is real and which is fake.\n\nSample 1: The main ESO archive query form is a web tool that allows users to search the ESO archive using Target Name, Coordinates, Observing Date, or Progra

In [14]:
# Run inference and save every result immediately
def process_and_save(text_generator, processed_data, output_file="results.csv"):
    """Generate several sampled answers per prompt, majority-vote them and append each result to a CSV.

    For every prompt, 5 sequences are sampled (thinking-mode sampling
    parameters, up to 4096 new tokens each), reduced to one answer with
    `get_consensus_answer`, and appended to `output_file` right away so an
    interrupted run loses no finished work.

    Row ids are taken from the dataset's own "id" column when it has one, so
    every answer is written next to the id of the prompt that produced it. If
    there is no such column, the previous behaviour is kept: ids are counted
    up from the notebook-level `idx` + 1.

    Params:
    text_generator: text-generation pipeline; called with a KeyDataset over the "prompt" field
    processed_data: dataset with a "prompt" field and, ideally, an "id" column
    output_file (str): CSV to append to; created with a header row if missing

    Returns:
    None
    """
    # Create the CSV with its header row if it does not exist yet
    if not os.path.exists(output_file):
        pd.DataFrame(columns=['id', 'real_text_id']).to_csv(
            output_file, index=False, encoding='utf-8'
        )

    if "id" in getattr(processed_data, "column_names", ()):
        sample_ids = processed_data["id"]
    else:
        # Legacy fallback: continue counting after the last id saved by a previous run.
        first_id = idx + 1
        sample_ids = range(first_id, first_id + len(processed_data))

    generations = text_generator(
        KeyDataset(processed_data, "prompt"),
        max_new_tokens=4096,
        return_full_text=False,
        **get_sampling_parameters(enable_thinking=True),
        num_return_sequences=5  # return 5 sequences per prompt
    )
    # Outputs are paired with ids in dataset order; each batch_out is expected
    # to be the list of 5 generations for one prompt.
    for sample_id, batch_out in zip(sample_ids, generations):
        # Majority vote over the sampled generations
        final_answer = get_consensus_answer(batch_out)

        # Append this single result immediately
        pd.DataFrame([{
            'id': sample_id,
            'real_text_id': final_answer
        }]).to_csv(
            output_file,
            mode='a',
            header=False,
            index=False,
            encoding='utf-8'
        )
        if sample_id % 10 == 0:
            print(f"已保存id={sample_id}的结果：{final_answer}")

        del batch_out, final_answer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()  # wait for pending GPU work after releasing the cache

# Example usage: run inference on the remaining prompts and append to the results CSV
process_and_save(text_generator, processed_data, "data/self_consistent_results.csv")

已保存id=950的结果：2
已保存id=960的结果：1
已保存id=970的结果：2
警告: 无效的判断结果 - ლი კრედიტი მინიჭებული კრედიტი მინიჭებული კრედიტი მ...，默认返回1
已保存id=980的结果：2
已保存id=990的结果：2
已保存id=1000的结果：1
已保存id=1010的结果：1
已保存id=1020的结果：2
已保存id=1030的结果：1
已保存id=1040的结果：1
已保存id=1050的结果：2
已保存id=1060的结果：1
