# Extracting prompts from SWE-bench

This notebook extracts prompts from the SWE-bench dataset to be used by humans and language models.

In [None]:
from datasets import load_dataset
import pandas as pd

# Load the test split of the "princeton-nlp/SWE-bench_oracle_llama" dataset
dataset = load_dataset("princeton-nlp/SWE-bench_oracle_llama", split='test')
# Convert to a pandas DataFrame so later cells can filter and sample rows
df = pd.DataFrame(dataset)

In [None]:
# Show the distinct values of the "repo" column (usable as `repo=` in get_rnd_issue)
df["repo"].unique()

In [None]:
import os

def get_rnd_issue(df: pd.DataFrame, repo: str=None, dir: str="./prompts_oracle/") -> pd.Series:
    """
    Pick a random issue from `df`, save its prompt to a file in `dir`, and return it.

    The prompt (the row's "text" column) is written as UTF-8 to
    `<dir>/issue_<instance_id>.txt`, overwriting any existing file of that name.

    Args:
        df: DataFrame with at least the columns "repo", "text" and "instance_id".
        repo: If given (and non-empty), only rows whose "repo" column equals this
            value are considered. If omitted, any row may be picked.
        dir: Output directory; created (including parents) if it does not exist.

    Returns:
        The sampled row as a pandas Series.

    Raises:
        ValueError: If there is no row to sample from (empty DataFrame or no
            row matching `repo`).
    """

    # If repo is specified, filter rows by the given repo name.
    if repo:
        df = df[df['repo'] == repo]

    # Fail early with a clear message instead of the opaque error that
    # DataFrame.sample raises on an empty frame, and before touching the disk.
    if df.empty:
        if repo:
            raise ValueError(f"No issues found for repo {repo!r}")
        raise ValueError("Cannot pick a random issue from an empty DataFrame")

    # Pick a random row from the (possibly filtered) DataFrame.
    issue = df.sample(n=1).iloc[0]

    text = issue['text']
    instance_id = issue['instance_id']

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(dir, exist_ok=True)

    # Create a filename using the "instance_id"; os.path.join avoids the
    # doubled separator when `dir` already ends with a slash.
    filename = os.path.join(dir, f"issue_{instance_id}.txt")

    # Save the "text" to the file
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)

    return issue

In [None]:
get_rnd_issue(df, repo="django/django")