In [2]:
import argparse
from datasets import Dataset, load_dataset, concatenate_datasets,Features,Sequence,Value
import os
from huggingface_hub import login
import sys
from argparse import Namespace
import re
import openai
import json

In [None]:
# Target schema shared by every benchmark shard handled in this notebook.
features = Features({
    "repo": Value("string"),
    "pull_number": Value("int64"),
    "test_patch": Value("string"),
    "issue_numbers": Sequence(Value("string")),
    "instance_id": Value("string"),
    "problem_statement": Value("string"),
    "version": Value("string"),
    "base_commit": Value("string"),
    "patch": Value("string"),
    "created_at": Value("string"),
    "hints_text": Value("string"),
    "environment_setup_commit": Value("string"),
    "FAIL_TO_PASS": Sequence(Value("string")),
    "PASS_TO_PASS": Sequence(Value("string")),
    "FAIL_TO_FAIL": Sequence(Value("string")),  # explicitly declared as an array of strings
    "PASS_TO_FAIL": Sequence(Value("string")),  # explicitly declared as an array of strings
})
# Notebook stand-in for command-line arguments.
args = Namespace(
    dataset_name="r1v3r/RustGPT_Bench_verified",
    split="train"
)

# The Hub token is read from the environment only; it is never hard-coded.
token = os.getenv("HUGGING_FACE_HUB_TOKEN")
if not token:
    # The message names the variable that is actually read above.
    print("Error: Hugging Face access token not provided. Set the HUGGING_FACE_HUB_TOKEN environment variable.")
    sys.exit(1)

login(token=token)

# Load the dataset
print(f"Loading dataset '{args.dataset_name}' split '{args.split}' from Hugging Face...")
try:
    dataset = load_dataset(args.dataset_name, split=args.split)
except Exception as e:
    print(f"Error loading dataset: {e}")
    sys.exit(1)

# Guarded so the cell can be re-run after the column has already been removed
# from the Hub copy (remove_columns raises on an unknown column).
if 'r_number' in dataset.column_names:
    dataset = dataset.remove_columns('r_number')
print("After removing r_number:", dataset.column_names)
# cast() returns a new Dataset; the result must be kept, otherwise the
# un-cast data is what gets pushed.
dataset = dataset.cast(features)
dataset.push_to_hub(args.dataset_name, split=args.split, token=token)


In [None]:
# Bring the benchmark loaded in the previous cell onto the shared schema.
dataset = dataset.cast(features)

# From the hyper shard keep only the two selected instances.
wanted_hyper_ids = ("hyperium__hyper-3261", "hyperium__hyper-3275")
hyper_dataset = (
    load_dataset("r1v3r/hyper_validated", split="train")
    .cast(features)
    .filter(lambda row: row["instance_id"] in wanted_hyper_ids)
)

# The serde and proc-macro2 shards are loaded with the schema and then cast to it.
serde_dataset = load_dataset(
    "r1v3r/serde_validated", split="train", features=features
).cast(features)
proc_macro2_dataset = load_dataset(
    "r1v3r/proc-macro2_validated", split="train", features=features
).cast(features)

In [None]:



# Stack the three per-repository shards in front of the verified benchmark,
# then publish the combined dataset under its own repository name.
shards_in_order = [hyper_dataset, serde_dataset, proc_macro2_dataset, dataset]
temp = concatenate_datasets(shards_in_order)

temp.push_to_hub("r1v3r/RustGPT_Bench_100", token=token)



修改某个值：将 instance_id 为 rayon-rs__rayon-986 的记录的 pull_number 更正为 986

In [None]:
# Correct the `pull_number` of a single record, located by its instance id.
target_instance_id = "rayon-rs__rayon-986"
new_pull_number = 986

# Scan only the id column instead of decoding every full row of the dataset.
index = next(
    (
        position
        for position, candidate_id in enumerate(dataset["instance_id"])
        if candidate_id == target_instance_id
    ),
    None,
)

if index is not None:
    # Rewrite only the matching row; every other row is passed through unchanged.
    dataset = dataset.map(
        lambda example, idx: {**example, "pull_number": new_pull_number} if idx == index else example,
        with_indices=True,
    )
    # Report the values that were really used (the old text printed fixed "2" and "1").
    print(f"Updated 'pull_number' to {new_pull_number} for instance with 'instance_id' {target_instance_id!r}.")
else:
    print(f"Error: No instance found with 'instance_id' == {target_instance_id!r}.")


统计每个实例的 patch 修改了多少个文件（buggy files）

In [None]:
def count_files_in_patch(example):
    """Count the distinct files touched by the unified diff in ``example['patch']``.

    A file is recognised by its header pair: a ``--- <old>`` line immediately
    followed by a ``+++ <new>`` line. Requiring the pair keeps hunk content
    that merely starts with ``--- `` or ``+++ `` from being counted.

    Each file is counted once: the git ``a/`` and ``b/`` prefixes are stripped
    and ``/dev/null`` (the missing side of an added or deleted file) is ignored.
    The new path is preferred; the old path is used for deletions.

    Args:
        example: Mapping with an ``'instance_id'`` entry and a ``'patch'``
            entry holding the diff text (``None`` or empty counts as 0 files).

    Returns:
        dict: ``{'instance_id': <id>, 'file_number': <number of distinct files>}``.
    """
    patch_text = example['patch'] or ''
    # Capture each path up to a tab (a timestamp may follow) or the end of the line.
    header_pairs = re.findall(
        r'(?m)^--- ([^\t\n]+)[^\n]*\n\+\+\+ ([^\t\n]+)', patch_text
    )

    unique_files = set()
    for old_path, new_path in header_pairs:
        # Prefer the post-change path; fall back to the old one for deletions.
        for candidate in (new_path, old_path):
            candidate = candidate.strip()
            if candidate == '/dev/null':
                continue
            if candidate.startswith(('a/', 'b/')):
                candidate = candidate[2:]
            unique_files.add(candidate)
            break

    return {'instance_id': example['instance_id'], 'file_number': len(unique_files)}

# Run the per-example counter over the dataset; the original columns are
# dropped so only what the counter returns remains.
results = dataset.map(count_files_in_patch, remove_columns=dataset.column_names)

# Pair the two result columns into an instance_id -> file count lookup table.
instance_file_map = dict(zip(results['instance_id'], results['file_number']))

# Show one "<instance_id>: <count>" line per instance.
for instance_id, file_number in instance_file_map.items():
    print(f"{instance_id}: {file_number}")


调用 GPT 判断哪些 problem_statement 属于功能开发（feature），而不是 bug 修复

In [3]:
import os
import openai
import json  # 用于保存 JSON 文件
from tqdm import tqdm
import pandas as pd

# Configure the OpenAI client. The key comes from the environment only.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast: without a key every request below would fail one by one.
    raise RuntimeError("OPENAI_API_KEY environment variable is not set; cannot call the OpenAI API.")
openai.base_url = "https://api5.xhub.chat/v1/"
# NOTE(review): this rebinds the notebook-wide name `dataset`; any later cell
# that uses `dataset` will see this dataset rather than the one loaded earlier.
dataset = load_dataset("r1v3r/auto_0207", split="train")
# Model configuration
model_name_or_path = "gpt-4o-mini"  # or "gpt-3.5-turbo"
system_messages = "You are an AI assistant that categorizes tasks into 'Bug Fix' or 'Feature Development'."
temperature = 0.5
top_p = 0.9

def classify_problem_statement(problem_statement):
    """
    Classify a problem statement with the OpenAI chat completions API.

    The model replies in free text, so the reply is searched case-insensitively
    for the two labels. When both labels are mentioned, the one mentioned first
    wins; previously 'Bug Fix' always won regardless of position.

    Args:
        problem_statement (str): The problem statement to classify.

    Returns:
        str: One of 'Bug Fix', 'Feature Development', 'Unknown' (no label found
        or empty reply) or 'Error' (the API call raised).
    """
    user_message = (
        f"Categorize the following problem statement as either 'Bug Fix' or 'Feature Development':\n\n\"{problem_statement}\""
    )

    try:
        response = openai.chat.completions.create(
            model=model_name_or_path,
            messages=[
                {"role": "system", "content": system_messages},
                {"role": "user", "content": user_message},
            ],
            temperature=temperature,
            top_p=top_p,
        )

        # The message content may be None; treat that as an empty reply.
        classification = (response.choices[0].message.content or "").strip()
        print(f"Classification: {classification}")

    except Exception as e:
        # Best effort: one failed request must not abort the whole run.
        print(f"Error processing problem_statement: {problem_statement}\nError: {e}")
        return 'Error'

    # Find where (if anywhere) each label is mentioned in the reply.
    lowered = classification.lower()
    mentioned = {}
    for label in ('Bug Fix', 'Feature Development'):
        position = lowered.find(label.lower())
        if position != -1:
            mentioned[label] = position

    if not mentioned:
        return 'Unknown'  # the reply names neither label
    # The earliest-mentioned label is taken as the model's answer.
    return min(mentioned, key=mentioned.get)

"""
主函数，用于加载数据集，分类 problem_statement,并保存结果。
"""
# Classify every problem statement in `dataset` and save the entries labelled
# 'Feature Development' to a JSON file.
feature_development_entries = []
# Tally of every label, so 'Error' and 'Unknown' results are visible afterwards
# instead of being dropped without notice.
classification_counts = {}

# Walk the dataset one example at a time.
print("Classifying problem statements...")
for example in tqdm(dataset, desc="Classifying"):
    problem_statement = example.get('problem_statement', None)
    instance_id = example.get('instance_id', None)

    # Missing or blank statements are not worth an API call.
    if pd.isna(problem_statement) or not str(problem_statement).strip():
        classification = 'Unknown'
    else:
        classification = classify_problem_statement(problem_statement)
    classification_counts[classification] = classification_counts.get(classification, 0) + 1

    # Only 'Feature Development' entries are collected for the output file.
    if classification == 'Feature Development':
        entry = {
            "instance_id": instance_id,
            "problem_statement": problem_statement,
            "response": classification
        }
        print("instance_id:", instance_id)
        feature_development_entries.append(entry)

print(f"Classification summary: {classification_counts}")
if classification_counts.get('Error', 0):
    # Errored items were never classified, so they are absent from the output file.
    print(f"Warning: {classification_counts['Error']} problem statements failed to classify; re-run them before filtering.")

# Save the collected 'Feature Development' entries as JSON.
output_json_path = "feature_development_entries.json"  # output file path
print(f"Saving Feature Development entries to {output_json_path}...")
with open(output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(feature_development_entries, json_file, ensure_ascii=False, indent=4)

print("分类完成，结果已保存。")




Classifying: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 648/648 [15:14<00:00,  1.41s/it]

Classification: This problem statement can be categorized as a **Bug Fix**. The issues described involve correcting the behavior of existing functions (`split()` and `splitn()`) that are not returning the expected results, which indicates that there are bugs in the implementation that need to be fixed. Additionally, the mention of existing tests passing incorrectly further supports this classification.
Saving Feature Development entries to feature_development_entries.json...
分类完成，结果已保存。





从合并后的数据集中去除被判定为 feature 的条目

In [5]:
import json

# json1: the merged dataset, read as one JSON object per line.
# json2: the 'Feature Development' entries saved by the classification step.
# NOTE(review): these are absolute, machine-specific paths, and the output is
# written as JSON Lines although its name ends in ".json".
json1_path = '/home/riv3r/SWE-bench/swebench/utils/merged_dataset.json'
json2_path = '/home/riv3r/SWE-bench/feature_development_entries.json'
output_jsonl_path = '/home/riv3r/SWE-bench/swebench/utils/ipynb/output.json'  # filtered output file

# Collect the instance_id of every entry that must be removed.
with open(json2_path, 'r', encoding='utf-8') as file:
    json2_data = json.load(file)

json2_instance_ids = {item['instance_id'] for item in json2_data}

kept_count = 0
removed_count = 0

# Stream json1 line by line and copy over only the records that are not listed in json2.
with open(json1_path, 'r', encoding='utf-8') as infile, \
     open(output_jsonl_path, 'w', encoding='utf-8') as outfile:

    for line in infile:
        stripped_line = line.strip()
        if not stripped_line:
            # Blank lines carry no record; skip them instead of reporting a parse error.
            continue

        try:
            item = json.loads(stripped_line)
        except json.JSONDecodeError as e:
            print(f"无法解析的 JSON 行: {line}, 错误: {e}")
            continue

        if item.get('instance_id') not in json2_instance_ids:
            outfile.write(json.dumps(item, ensure_ascii=False) + '\n')
            kept_count += 1
        else:
            removed_count += 1

print(f"Kept {kept_count} entries; removed {removed_count} feature-development entries.")
print(f"Filtered data has been saved to {output_jsonl_path}")

Filtered data has been saved to /home/riv3r/SWE-bench/swebench/utils/ipynb/output.json


In [None]:
# Publish the in-memory `dataset` to the Hub repository named in `args`.
# NOTE(review): `dataset`, `args` and `token` come from earlier cells. `dataset`
# is rebound several times in this notebook, so confirm it holds the intended
# (filtered) data before running this cell.
repo_id = args.dataset_name
upload_notice = f"Uploading the cleaned dataset to Hugging Face repository '{repo_id}'..."
print(upload_notice)

dataset.push_to_hub(repo_id=repo_id, token=token)