In [1]:
import json

def get_prompt(instruction, input):
    """Build the single-choice logic-reasoning prompt for one question.

    Args:
        instruction: Text placed under the problem heading of the prompt.
        input: Question text. Segments separated by ``'; '`` that start with
            ``A:``, ``B:``, ``C:`` or ``D:`` are additionally re-listed below
            the question, one option per line, as ``"<letter>. <text>"``.

    Returns:
        The complete prompt string.
    """
    # Collect (letter, text) pairs. Each option keeps the letter it carries in
    # the input: relabelling by position would shift every label whenever one
    # option is not matched (for example when "A:" follows the question text
    # inside the first segment).
    options = []
    for segment in input.split('; '):
        if segment.startswith(('A:', 'B:', 'C:', 'D:')):
            letter, text = segment.split(':', 1)
            # Strip the blank that follows the colon so the rendered line is
            # "A. text" rather than "A.  text".
            options.append((letter, text.strip()))

    # One option per line.
    options_str = '\n'.join(f"{letter}. {text}" for letter, text in options)

    # Assemble the prompt.
    prompt = f"""你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有的问题都是（close-world assumption）闭世界假设，即未观测事实都为假。请逐步分析问题并在最后一行输出答案，最后一行的格式为"A"。题目如下：

### 题目:
{instruction}

### 问题:
{input}
{options_str}
"""

    return prompt


def read_and_process_json_file(file_path):
    """Load a JSON list of question records and turn each one into a prompt.

    Args:
        file_path: Path of a UTF-8 JSON file whose top-level value is iterated
            record by record; each record is indexed with "instruction" and
            "input".

    Returns:
        A list with one prompt per record, in file order, as produced by
        ``get_prompt``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    return [
        get_prompt(record["instruction"], record["input"])
        for record in records
    ]


# if __name__ == "__main__":
#     # 指定输入文件路径
#     input_file_path = './dataset/output_test.json'

#     # 读取并处理JSON文件
#     prompts = read_and_process_json_file(input_file_path)
#     print(prompts[:10])

#     # # 打印生成的prompts
#     # for i, prompt in enumerate(prompts, start=1):
#     #     print(f"Prompt {i}:{prompt}")
#     #     if i==3:
#     #         break
#     print(len(prompts))

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel
from tqdm import tqdm


# Base model used for inference; the commented-out paths are alternative checkpoints.
mode_path = '/share/new_models/qwen/Qwen2-7B-Instruct/'
# mode_path = '/root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b'
# mode_path = './merge/'
# Change this to the checkpoint directory produced by your own LoRA run.
lora_path = './output/Qwen2_instruct_lora/checkpoint-100'

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)

# Load the model in bfloat16 and switch it to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    mode_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()

# Optionally load the LoRA weights on top of the base model.
# model = PeftModel.from_pretrained(model, model_id=lora_path)

# Read the test set and build one prompt per question.
input_file_path = './dataset/output_test.json'
prompts = read_and_process_json_file(input_file_path)

# Generation settings do not change between prompts, so build them once.
# top_k=1 limits sampling to the single most likely token; note that
# max_length counts the prompt tokens as well as the generated ones.
gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}

answer_list = []
for prompt in tqdm(prompts):
    # Move the encoded chat prompt to wherever the model actually lives
    # instead of assuming a 'cuda' device.
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    # Cut off the prompt tokens so only the newly generated text is decoded.
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_list.append(answer)
    # print(answer_list)
        

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  1%|          | 14/1328 [01:08<1:33:13,  4.26s/it]

In [None]:
from lmdeploy import pipeline, GenerationConfig, PytorchEngineConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel
from tqdm import tqdm

from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
import nest_asyncio
 
# Patch asyncio so nested event loops are allowed; presumably required because
# the notebook kernel already runs a loop — TODO confirm.
nest_asyncio.apply()
backend_config = TurbomindEngineConfig(tp=1)

# Alternative backend with a LoRA adapter:
# backend_config = PytorchEngineConfig(session_len=2048,
#                                      adapters=dict(lora_name_1='chenchi/lora-chatglm2-6b-guodegang'))
gen_config = GenerationConfig(top_p=0.2,
                              top_k=1,
                              temperature=0.2,
                              max_new_tokens=1024)
pipe = pipeline('/root/share/new_models/Shanghai_AI_Laboratory/internlm2-chat-1_8b',
                backend_config=backend_config)

# Read the test set and build one prompt per question.
input_file_path = './dataset/output_test.json'
prompts = read_and_process_json_file(input_file_path)

answer_list = []
for prompt in tqdm(prompts):
    response = pipe(prompt, gen_config=gen_config)
    # Keep only the generated text: the Response object itself cannot be
    # serialised by json.dumps, and the transformers cell also stores plain strings.
    answer_list.append(response.text)
    # print(response)

                                                                            



  0%|          | 0/1328 [00:00<?, ?it/s]

In [None]:
import json


def process_json(json_file_path, answer_list, output_file_path):
    """Group per-question answers by problem and write them as JSON Lines.

    The input file holds a list of items, each with an ``instruction`` key.
    Consecutive items that share the same ``instruction`` are treated as the
    questions of one problem and become a single output record::

        {"id": "round1_test_data_001", "questions": [{"answer": ...}, ...]}

    Record ids are numbered from 1 in order of appearance. Answers are matched
    to items by position (``answer_list[i]`` belongs to the i-th item);
    surplus answers are ignored. ``answer_list`` is only read, never modified.

    Args:
        json_file_path: Path of the UTF-8 JSON file with the question items.
        answer_list: Sequence of JSON-serialisable answers, one per item.
        output_file_path: Path of the JSONL file to write (overwritten).

    Raises:
        IndexError: If ``answer_list`` has fewer entries than the file has items.
    """
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Fail before writing anything if some question would be left without an answer.
    if len(answer_list) < len(data):
        raise IndexError(
            f"answer_list has {len(answer_list)} entries but "
            f"{json_file_path} contains {len(data)} questions"
        )

    output_data = []
    current_instruction = None

    # zip pairs answers with items by position without consuming the caller's list.
    for item, answer in zip(data, answer_list):
        # A new group starts at the first item and whenever the instruction changes.
        if not output_data or item['instruction'] != current_instruction:
            current_instruction = item['instruction']
            round_id = len(output_data) + 1
            output_data.append({
                'id': f'round1_test_data_{round_id:03d}',
                'questions': []
            })

        output_data[-1]['questions'].append({'answer': answer})

    # One JSON object per line.
    with open(output_file_path, 'w', encoding='utf-8') as jsonl_file:
        for record in output_data:
            jsonl_file.write(json.dumps(record, ensure_ascii=False) + '\n')


In [None]:
# Display the number of answers collected in answer_list.
len(answer_list)

In [None]:
 input_file_path = './dataset/output_test.json'

# 指定答案列表
answers = answer_list

# 指定输出文件路径
output_file_path = './dataset/submit.jsonl'

# 处理 JSON 文件并输出到 JSONL 文件
process_json(input_file_path, answers, output_file_path)

In [None]:
def write_list_to_file(file_path, data_list):
    """Write every item of ``data_list`` to ``file_path``, one per line.

    Each item is formatted through an f-string and followed by a newline.
    The file is overwritten and always encoded as UTF-8, so non-ASCII text is
    written the same way regardless of the platform's default encoding.

    Args:
        file_path: Path of the text file to create or overwrite.
        data_list: Iterable of items to write.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{item}\n" for item in data_list)

# Target text file for the dump.
file_path = 'a.txt'

# The list to dump: the answers collected earlier.
options = answer_list

# Write the list out, one entry per line.
write_list_to_file(file_path, options)