# 提取所有题目信息

In [1]:
import json
import pandas as pd

In [2]:
# Languages whose question sets are exported by the loop further down.
LANGUAGE_LIST = [
    "python",
]

# Language name -> file extension used for that language's
# signature.<ext> and test.<ext> files.
SUFFIX_MAP = {
    "python": "py",
    "javascript": "js",
    "typescript": "ts",
    "c&cpp": "cpp",
    "java": "java",
}


def generate_python_prompt(code_signature, code_type):
    """Build the Python code-generation prompt for a signature.

    Args:
        code_signature: Signature text appended to the end of the prompt.
        code_type: "method" for a function prompt, "class" for a class prompt.

    Returns:
        The prompt string, or None when code_type is neither "method"
        nor "class".
    """
    prompt = None
    if code_type == "method":
        prompt = f"please write a python function , the function signature as below {code_signature}"
    elif code_type == "class":
        prompt = f"please write a python class , the class signature as below {code_signature}"
    return prompt


def generate_javascript_prompt(code_signature, code_type):
    """Build the JavaScript code-generation prompt for a signature.

    Args:
        code_signature: Signature text appended to the end of the prompt.
        code_type: "method" for a function prompt, "class" for a class prompt.

    Returns:
        The prompt string, or None when code_type is neither "method"
        nor "class".
    """
    prompt = None
    if code_type == "method":
        prompt = f"please write a javascript function , the function signature as below {code_signature}"
    elif code_type == "class":
        prompt = f"please write a javascript class , the class signature as below {code_signature}"
    return prompt


def generate_typescript_prompt(code_signature, code_type):
    """Build the TypeScript code-generation prompt for a signature.

    Args:
        code_signature: Signature text appended to the end of the prompt.
        code_type: "method" for a function prompt, "class" for a class prompt.

    Returns:
        The prompt string, or None when code_type is neither "method"
        nor "class".
    """
    prompt = None
    if code_type == "method":
        prompt = f"please write a typescript function , the function signature as below {code_signature}"
    elif code_type == "class":
        prompt = f"please write a typescript class , the class signature as below {code_signature}"
    return prompt


def generate_ccpp_prompt(code_signature, code_type):
    """Build the C/C++ code-generation prompt for a signature.

    Args:
        code_signature: Signature text appended to the end of the prompt.
        code_type: "method" for a function prompt, "class" for a class prompt.

    Returns:
        The prompt string, or None when code_type is neither "method"
        nor "class".
    """
    prompt = None
    if code_type == "method":
        prompt = f"please write a cpp function , the function signature as below {code_signature}"
    elif code_type == "class":
        prompt = f"please write a cpp class , the class signature as below {code_signature}"
    return prompt


# Language name -> function that builds the prompt for that language.
# Each function is called as builder(code_signature, code_type).
# NOTE(review): SUFFIX_MAP has a "java" key but there is no entry for it
# here, so adding "java" to LANGUAGE_LIST would raise KeyError on lookup.
LANGUAGE_PROMPT_MAP = {
    "python": generate_python_prompt,
    "javascript": generate_javascript_prompt,
    "typescript": generate_typescript_prompt,
    "c&cpp": generate_ccpp_prompt,
}

In [3]:
# For every configured language: read the checked tasks from the Excel
# sheet, load each task's signature and test files, build the prompt, and
# write all question records to ./question/<language>.json.
for language in LANGUAGE_LIST:
    # Read the Excel sheet for this language and keep only the rows whose
    # "check" column equals "yes".
    excel_data = pd.read_excel("./xlsx/RealisticEval-Data.xlsx", sheet_name=language)
    data = excel_data[excel_data['check'] == 'yes']
    # Build one question record per remaining row.
    question_array = []
    for _, row in data.iterrows():  # the row index itself is not used
        task_id = int(row["task_id"])
        code_type = row["code_type"]
        dir_path = f"../all/t{task_id}"
        # The signature and test files sit side by side in the task's
        # per-language directory and share the language's file extension.
        signature_path = f"{dir_path}/{language}/signature.{SUFFIX_MAP[language]}"
        with open(signature_path, "r", encoding="utf8") as signature_file:
            code_signature = signature_file.read()
        test_path = f"{dir_path}/{language}/test.{SUFFIX_MAP[language]}"
        with open(test_path, "r", encoding="utf8") as test_file:
            code_test = test_file.read()
        # NOTE(review): a signature file can be empty (the recorded run
        # prints "task_id:13 signature:0"); the prompt is still generated
        # and exported in that case -- confirm this is intended.
        prompt = LANGUAGE_PROMPT_MAP[language](code_signature, code_type)
        question_array.append({
            "task_id": task_id,
            "code_type": code_type,
            "code_language": language,
            "test_code": code_test,
            "prompt": prompt,
        })
        # Progress line: lengths of the loaded signature and test sources.
        print(f"task_id:{task_id} signature:{len(code_signature)} test:{len(code_test)}")
    # Serialize all records of this language as a single JSON array. The
    # with-block flushes and closes the file on exit, so no explicit
    # flush() call is needed.
    with open(f"./question/{language}.json", "w", encoding="utf8") as question_file:
        json.dump(question_array, question_file)

task_id:1 signature:458 test:744
task_id:5 signature:362 test:1591
task_id:6 signature:316 test:661
task_id:7 signature:512 test:1048
task_id:8 signature:513 test:1001
task_id:9 signature:375 test:1022
task_id:11 signature:300 test:1239
task_id:12 signature:597 test:1665
task_id:13 signature:0 test:3520
task_id:14 signature:503 test:2310
task_id:17 signature:528 test:1746
task_id:18 signature:278 test:971
task_id:19 signature:319 test:833
task_id:20 signature:302 test:1932
task_id:21 signature:501 test:1991
task_id:22 signature:484 test:1570
task_id:23 signature:566 test:1421
task_id:24 signature:267 test:2662
task_id:25 signature:649 test:2523
task_id:26 signature:411 test:936
task_id:27 signature:277 test:2102
task_id:28 signature:457 test:1399
task_id:31 signature:460 test:918
task_id:33 signature:453 test:2351
task_id:35 signature:486 test:1067
task_id:36 signature:810 test:2157
task_id:37 signature:681 test:1801
task_id:38 signature:1179 test:1349
task_id:40 signature:350 test:116