# 1. 加载数据

In [1]:
# 使用OpenAI接口进行推理，去除自一致性
import datasets
import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

import os
import pandas as pd
import re
import json
import dotenv
dotenv.load_dotenv()

# Create the OpenAI-compatible client shared by every request below.
# NOTE(review): the "EMPTY" key and the 0.0.0.0 address suggest a self-hosted
# OpenAI-compatible server — confirm against the deployment.
client = openai.OpenAI(base_url="http://0.0.0.0:8192/v1", api_key="EMPTY")

def read_texts_from_dir(dir_path):
    """
    Read the text pairs stored under ``dir_path`` into a pandas DataFrame.

    Every immediate sub-directory is expected to hold ``file_1.txt`` and
    ``file_2.txt``. The last four characters of the sub-directory name are
    parsed as the integer sample id.

    Params:
    dir_path (str): path to the directory with data

    Returns:
    pd.DataFrame: indexed by 'id' with columns ['file_1', 'file_2'], in sorted
    folder-name order. Sub-directories that cannot be read (missing file,
    undecodable text, non-numeric name suffix) are reported and skipped.
    """
    # Only immediate sub-directories are samples; nested directories are not
    # counted, so the reported number matches what is actually iterated.
    folder_names = sorted(
        name for name in os.listdir(dir_path)
        if os.path.isdir(os.path.join(dir_path, name))
    )
    print(f"Number of directories: {len(folder_names)}")

    # Rows are appended as they are read, so a skipped directory never leaves
    # a placeholder behind that would break the DataFrame construction.
    rows = []
    for folder_name in folder_names:
        folder_path = os.path.join(dir_path, folder_name)
        try:
            with open(os.path.join(folder_path, 'file_1.txt'), 'r', encoding='utf-8') as f1:
                text1 = f1.read().strip()
            with open(os.path.join(folder_path, 'file_2.txt'), 'r', encoding='utf-8') as f2:
                text2 = f2.read().strip()
            index = int(folder_name[-4:])
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: bad id suffix or
            # a decoding failure (UnicodeDecodeError is a ValueError).
            print(f"Error reading directory {folder_name}: {e}")
            continue
        rows.append((index, text1, text2))

    # Change list with results into pandas DataFrame
    return pd.DataFrame(rows, columns=['id', 'file_1', 'file_2']).set_index('id')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locate the test split below the directory named by the ROOT_DATA_DIR
# environment variable and load it into a DataFrame.
data_dir = os.environ.get("ROOT_DATA_DIR")
if data_dir is None:
    # Fail with an explicit message instead of the opaque TypeError that
    # os.path.join raises when handed None.
    raise RuntimeError("ROOT_DATA_DIR environment variable is not set")
data_name = "fake-or-real-the-impostor-hunt"
test_path = os.path.join(data_dir, data_name, "test")
df_test = read_texts_from_dir(test_path)

Number of directories: 1068


In [3]:
# Preview the first rows of the loaded test DataFrame.
df_test.head()

Unnamed: 0_level_0,file_1,file_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"""Music"" Music music music Music music Music mu...",Since its launch on Paranal observatory's Very...
1,underground exploration on SN's birth has prov...,SN 1987A provides valuable insights as newer o...
2,This research aimed to understand how star sha...,ChromeDriver music player\nThis study focused ...
3,Using OmegaCAM's wide field capabilities spann...,"greek translation :\nvazhi (megaCAM), territor..."
4,AssemblyCulture AssemblyCulture AssemblyCultur...,XClass is software tool that helps astronomers...


# 2. 加载模型

In [None]:
# OpenAI model configuration
MODEL_NAME = "Qwen/Qwen3-8B"  # or another model such as "gpt-3.5-turbo"
MAX_TOKENS = 4096
TEMPERATURE = 0.1  # low temperature for more consistent outputs

print(f"使用模型: {MODEL_NAME}")
# Report the endpoint the client is actually configured with; the client is
# built with an explicit base_url and does not read OPENAI_API_BASE, so
# printing that environment variable could show a different address.
print(f"API Base: {client.base_url}")

使用模型: Qwen/Qwen3-8B
API Base: http://0.0.0.0:8192


# 3. 数据预处理

In [5]:
# System prompt: describes the task and the standardized output format
# (final answer wrapped in \boxed{}) that the answer extraction relies on.
system_prompt = """You are tasked with judging the authenticity of two text samples. Given two texts labeled 1 and 2, one is real and the other is fake. Please think step by step, but only keep a minimum draft for each thinking step, with 5 words at most.

To ensure consistent evaluation, please follow this strict output format:
1. First, you may analyze and reason through the samples (this part will not be evaluated)
2. For your final answer, you must wrap the number of the real sample (only 1 or 2) within \\boxed{}
3. Do not include any other content outside the \\boxed{} tag
4. Examples of correct format: \\boxed{1} or \\boxed{2}
5. Any output not following this format will be considered invalid"""

def create_prompt(sample1, sample2):
    """Build the user prompt for one pair of samples.

    Params:
    sample1 (str): text shown as "Sample 1"
    sample2 (str): text shown as "Sample 2"

    Returns:
    str: the prompt template with both placeholders filled in, including the
    standardized output-format instructions.
    """
    template = """Below are two text samples. Determine which is real and which is fake.

Sample 1: [[sample1]]

Sample 2: [[sample2]]

Please follow these steps:
1. You may first compare their content, logical consistency, information accuracy, and language style
2. Then, provide your final answer by putting only the number of the real sample (1 or 2) within \\boxed{}
3. Ensure there is no other text outside the \\boxed{} tag

Example of correct output:
After comparing the two samples, I think Sample 1 is more consistent and accurate. Answer: \\boxed{1}

Example of incorrect output:
2 (This is missing the required boxed format)"""
    substitutions = {"[[sample1]]": sample1, "[[sample2]]": sample2}
    # Fill both placeholders in a single pass over the template. Chained
    # str.replace calls would re-scan the text already inserted for sample 1
    # and overwrite a literal "[[sample2]]" occurring inside it. A callable
    # replacement also keeps backslashes in the samples untouched.
    return re.sub(
        r"\[\[sample[12]\]\]",
        lambda match: substitutions[match.group(0)],
        template,
    )

def process_sample(sample1, sample2):
    """Build the chat ``messages`` list (system + user) for one sample pair."""
    system_message = {"role": "system", "content": system_prompt}
    # Each text is cut to its first 4096 characters before being templated.
    user_content = create_prompt(sample1[:4096], sample2[:4096])
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]

In [6]:
# Convert the DataFrame into a datasets.Dataset and show its first row.
data = datasets.Dataset.from_pandas(df_test)
data[0]

{'file_1': '"Music" Music music music Music music Music music Music music\nThe two telescopes using "Music" have been incredibly busy since their launch! They\'re incredibly popular for research on Earthly objects like star clusters or celestial bodies like planets or even galaxies far away! They\'ve produced many scientific publications within just a few years - so many so that they dominate by far when it comes to peer reviewed articles published from those telescopes! These musical journey has produced over a hundred articles published through various outlets like journals such as \'Nature\'. Some notable achievements include discovering near space objects found between star systems as well as identifying cosmic events such as how gravity affects light waves traveling through spacemusic\nBeyond its contributions to individual research projects with specific scientific goals (like finding new types or patterns) , it also contributes significantly to our overall understanding by being

In [7]:
def _attach_messages(row):
    """Return the new ``messages`` column value for one dataset row."""
    return {"messages": process_sample(row["file_1"], row["file_2"])}


# Replace the two raw text columns with the chat messages built from them.
processed_data = data.map(_attach_messages, remove_columns=["file_1", "file_2"])
# Show the messages of the first sample
print("示例消息:")
print(processed_data[0]["messages"])

Map: 100%|██████████| 1068/1068 [00:00<00:00, 9977.30 examples/s] 

示例消息:
[{'content': 'You are tasked with judging the authenticity of two text samples. Given two texts labeled 1 and 2, one is real and the other is fake. Please think step by step, but only keep a minimum draft for each thinking step, with 5 words at most.\n\nTo ensure consistent evaluation, please follow this strict output format:\n1. First, you may analyze and reason through the samples (this part will not be evaluated)\n2. For your final answer, you must wrap the number of the real sample (only 1 or 2) within \\boxed{}\n3. Do not include any other content outside the \\boxed{} tag\n4. Examples of correct format: \\boxed{1} or \\boxed{2}\n5. Any output not following this format will be considered invalid', 'role': 'system'}, {'content': 'Below are two text samples. Determine which is real and which is fake.\n\nSample 1: "Music" Music music music Music music Music music Music music\nThe two telescopes using "Music" have been incredibly busy since their launch! They\'re incredibly popu




# 推理与保存

In [8]:
# OpenAI inference helper wrapped in a retry policy
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(5))
def call_openai_with_retry(messages, model=MODEL_NAME, max_tokens=MAX_TOKENS, temperature=TEMPERATURE):
    """
    Send one chat-completion request and return the first choice's content.

    Any exception is printed and re-raised so that the ``@retry`` decorator
    (random exponential wait, stop after 5 attempts) can handle it.
    """
    request_kwargs = dict(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    try:
        completion = client.chat.completions.create(**request_kwargs)
        reply_text = completion.choices[0].message.content
    except Exception as e:
        print(f"API调用失败: {e}")
        raise e
    return reply_text

def generate_response(messages):
    """Produce a single model reply for ``messages`` (no self-consistency voting)."""
    reply = call_openai_with_retry(messages)
    return reply

In [9]:
# Answer extraction: pull the boxed 1 or 2 out of the generated text
def extract_answer(generated_text):
    """Extract the chosen sample number from a model reply.

    Lookup order:
    1. the LAST ``\\boxed{1}`` / ``\\boxed{2}`` in the text (earlier ones may be
       echoed format examples or intermediate guesses made while reasoning);
    2. the last bare ``{1}`` / ``{2}``;
    3. whichever of "1" / "2" appears alone in the final 25 characters.

    Params:
    generated_text (str | None): model reply; ``None`` is treated as empty.

    Returns:
    str: '1' or '2'. When nothing usable is found a warning is printed and
    '1' is returned as the default.
    """
    text = generated_text or ""

    boxed = re.findall(r'\\boxed\{(1|2)\}', text)
    if boxed:
        return boxed[-1]

    braced = re.findall(r'\{([12])\}', text)
    if braced:
        return braced[-1]

    tail = text[-25:]
    has_one = "1" in tail
    has_two = "2" in tail
    if has_one and not has_two:
        return '1'
    if has_two and not has_one:
        return '2'

    print(f"警告: 无效的判断结果 - {text[-50:]}...，默认返回1")
    return '1'

In [10]:
# Resume support: load results saved by an earlier run if the file exists,
# otherwise start from an empty table with the same two columns.
baseline_results_file = "data/baseline_results.csv"
if not os.path.exists(baseline_results_file):
    pred_data = pd.DataFrame(columns=['id', 'real_text_id'])
    print("未找到现有结果文件，从头开始")
else:
    pred_data = pd.read_csv(baseline_results_file)
    print(f"找到现有结果文件，已完成 {len(pred_data)} 条记录")

找到现有结果文件，已完成 0 条记录


In [11]:
# idx is the first-column value of the last saved row, or -1 when nothing
# has been saved yet.
if len(pred_data) == 0:
    idx = -1
    print("从头开始处理")
else:
    idx = pred_data.iloc[-1, 0]
    print(f"从 id={idx} 后继续")

从头开始处理


In [12]:
# Keep only the rows after position idx (idx itself is excluded).
processed_data = processed_data.select(range(idx+1, len(processed_data)))

In [13]:
# Show the first remaining row.
processed_data[0]

{'id': 0,
 'messages': [{'content': 'You are tasked with judging the authenticity of two text samples. Given two texts labeled 1 and 2, one is real and the other is fake. Please think step by step, but only keep a minimum draft for each thinking step, with 5 words at most.\n\nTo ensure consistent evaluation, please follow this strict output format:\n1. First, you may analyze and reason through the samples (this part will not be evaluated)\n2. For your final answer, you must wrap the number of the real sample (only 1 or 2) within \\boxed{}\n3. Do not include any other content outside the \\boxed{} tag\n4. Examples of correct format: \\boxed{1} or \\boxed{2}\n5. Any output not following this format will be considered invalid',
   'role': 'system'},
  {'content': 'Below are two text samples. Determine which is real and which is fake.\n\nSample 1: "Music" Music music music Music music Music music Music music\nThe two telescopes using "Music" have been incredibly busy since their launch! Th

In [14]:
# Process samples and save each result immediately (OpenAI API, no self-consistency)
def _append_baseline_row(output_file, sample_id, answer):
    """Append one ``id,real_text_id`` row (no header) to the results CSV."""
    pd.DataFrame([{
        'id': sample_id,
        'real_text_id': answer
    }]).to_csv(
        output_file,
        mode='a',
        header=False,
        index=False,
        encoding='utf-8'
    )


def _load_completed_ids(output_file):
    """Return the set of ids already stored in the results CSV (empty if unreadable)."""
    try:
        return set(pd.read_csv(output_file)['id'].tolist())
    except (pd.errors.EmptyDataError, KeyError):
        return set()


def process_and_save_baseline(processed_data, output_file="data/baseline_results.csv"):
    """Run one inference per sample and append each result to ``output_file``.

    Params:
    processed_data: indexable collection of rows; each row is a dict with a
        "messages" entry and, normally, an "id" entry. The row's own "id" is
        what gets written, so a dataset that was already sliced for resuming
        still produces correct ids. Rows without "id" fall back to their
        position.
    output_file (str): CSV with columns ['id', 'real_text_id']; it is created
        with a header when missing.

    Rows whose id is already present in ``output_file`` are skipped, which
    makes the function resumable without relying on any global offset.
    If inference or answer extraction fails for a row, the error is printed
    and the default answer '1' is stored so processing continues.
    """
    # Initialise the CSV file (and its parent directory) when needed.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    if not os.path.exists(output_file):
        pd.DataFrame(columns=['id', 'real_text_id']).to_csv(
            output_file, index=False, encoding='utf-8'
        )

    # Resume from what is on disk rather than from a positional offset.
    completed_ids = _load_completed_ids(output_file)

    for position in range(len(processed_data)):
        sample = processed_data[position]
        sample_id = sample.get("id", position)
        if sample_id in completed_ids:
            continue

        try:
            # Single generation for the current sample, then parse the answer.
            generated_text = generate_response(sample["messages"])
            final_answer = extract_answer(generated_text)
        except Exception as e:
            print(f"处理id={sample_id}时出错: {e}")
            # Store the default answer and keep going.
            _append_baseline_row(output_file, sample_id, '1')
            continue

        # Save immediately so an interruption loses at most one sample.
        _append_baseline_row(output_file, sample_id, final_answer)

        if sample_id % 10 == 0:
            print(f"已保存id={sample_id}的结果：{final_answer}")

# Example usage: run the baseline over the prepared data and write the CSV.
process_and_save_baseline(processed_data, "data/baseline_results.csv")

已保存id=0的结果：2
已保存id=10的结果：1
已保存id=10的结果：1
已保存id=20的结果：2
已保存id=20的结果：2
已保存id=30的结果：2
已保存id=30的结果：2
已保存id=40的结果：2
已保存id=40的结果：2
已保存id=50的结果：1
已保存id=50的结果：1
已保存id=60的结果：1
已保存id=60的结果：1
已保存id=70的结果：1
已保存id=70的结果：1
已保存id=80的结果：2
已保存id=80的结果：2
已保存id=90的结果：2
已保存id=90的结果：2
已保存id=100的结果：2
已保存id=100的结果：2
已保存id=110的结果：2
已保存id=110的结果：2
已保存id=120的结果：2
已保存id=120的结果：2
已保存id=130的结果：1
已保存id=130的结果：1
已保存id=140的结果：1
已保存id=140的结果：1
警告: 无效的判断结果 - m stuck. I'll have to choose one. Maybe the answer...，默认返回1
警告: 无效的判断结果 - m stuck. I'll have to choose one. Maybe the answer...，默认返回1
已保存id=150的结果：2
已保存id=150的结果：2
已保存id=160的结果：2
已保存id=160的结果：2
已保存id=170的结果：1
已保存id=170的结果：1
已保存id=180的结果：1
已保存id=180的结果：1
已保存id=190的结果：2
已保存id=190的结果：2
已保存id=200的结果：1
已保存id=200的结果：1
已保存id=210的结果：1
已保存id=210的结果：1
已保存id=220的结果：1
已保存id=220的结果：1
已保存id=230的结果：1
已保存id=230的结果：1
已保存id=240的结果：2
已保存id=240的结果：2
已保存id=250的结果：1
已保存id=250的结果：1
已保存id=260的结果：1
已保存id=260的结果：1
已保存id=270的结果：1
已保存id=270的结果：1
已保存id=280的结果：1
已保存id=280的结果：1
已保存id=290的结果：1
已保