In [17]:
import os
os.environ["OPENAI_API_KEY"] = (
    f"{os.getenv('OPENAI_API_KEY')}"
)

MODEL_NAME="gpt-4o"
# MODEL_NAME="o3-mini"

In [18]:
from minions.minions_code import Minions
from minions.minion import Minion
from minions.clients.ollama import OllamaClient
from minions.clients.openai import OpenAIClient

In [19]:
local_client = OllamaClient(model_name="qwen2.5-coder", temperature=0.0)
remote_client = OpenAIClient(model_name="o3-mini", temperature=0.0)
minions = Minions(local_client, remote_client)

# remote_client = OpenAIClient(model_name="gpt-4o", temperature=0.0)

# minions = Minion(local_client, remote_client)

### Run minions model for error detection

In [20]:
import csv
from tqdm import tqdm
# Define the document metadata
doc_metadata = "python code"
num_tasks_per_round = 1
num_samples_per_task = 1

task = f"""\
Identify all functions that contain any errors in the code. The errors might be logical or there might be multiple errors within one function.
Your final answer should start with a list of all functions containing errors. This should then be followed by proposed code fixes (respond with code) for each erroneous function."""

task_old = f"""\

Identify all errors in the code and provide a brief explanation of each error. The errors might be logical or there might be multiple errors in the code. Identify them all.

Your final answer should be a list of all errors and proposed code fixes (respond with code)"""

results = []
with open('swebench_data.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip header row
    for row in tqdm(reader):
        code_context = [row[1]]  # row[1] contains the modified/buggy code
        
        output = minions(
            task=task,
            doc_metadata=doc_metadata,
            context=code_context,
            max_rounds=5,
            num_tasks_per_round=num_tasks_per_round,
            num_samples_per_task=num_samples_per_task,
        )
        # output = minions(
        #     task=task,
        #     doc_metadata=doc_metadata,
        #     context=code_context,
        #     max_rounds=5
        # )
        # File,Buggy Code,Errors,Buggy Functions
        results.append([row[0], row[1], row[2], row[3], output['final_answer'], output['meta']])  # Store original file, buggy code, and minions output
        
# Write results to CSV
with open('minions_results.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['File', 'Buggy Code', 'Errors', 'Buggy Functions', 'Model Analysis', 'Meta'])
    writer.writerows(results)

4it [09:14, 122.39s/it]

Error executing code (attempt 1 of 10 max attempts): ValueError: Function prepare_jobs not found in the code block.
Error executing code (attempt 2 of 10 max attempts): ValueError: Function prepare_jobs not found in the code block.
Error executing code (attempt 3 of 10 max attempts): ValueError: Function prepare_jobs not found in the code block.
Error executing code (attempt 5 of 10 max attempts): ValueError: Function prepare_jobs not found in the code block.
Error executing code (attempt 7 of 10 max attempts): ValueError: Function prepare_jobs not found in the code block.


5it [11:08, 119.41s/it]

Error executing code (attempt 9 of 10 max attempts): ValueError: Function prepare_jobs not found in the code block.


8it [19:02, 145.65s/it]

Error executing code (attempt 3 of 10 max attempts): SyntaxError: invalid character '’' (U+2019) (<string>, line 15)


31it [1:32:34, 146.08s/it]Error during OpenAI API call: Error code: 400 - {'error': {'message': 'Could not finish the message because max_tokens was reached. Please try again with higher max_tokens.', 'type': 'invalid_request_error', 'param': None, 'code': None}}
31it [1:33:42, 181.37s/it]


BadRequestError: Error code: 400 - {'error': {'message': 'Could not finish the message because max_tokens was reached. Please try again with higher max_tokens.', 'type': 'invalid_request_error', 'param': None, 'code': None}}

#### Evaluate performance

In [14]:
import pandas as pd

In [17]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import openai
def process_row(ground_truth, minions_text):
    """
    Process a single Python file by evaluating results using GPT.
    Returns a tuple of 5 integers: (true positives, true negatives, false positives, false negatives, total number of actual errors)
    """
    
    prompt = f"""Here is the ground truth list of functions containing errors within a codebase:
    {ground_truth}.

    You are also provided a list of the functions identified by the model:
    {minions_text}
    
    Please return a list of 5 integers in the format [a, b, c, d, e] corresponding to:
    (true positives, true negatives, false positives, false negatives, total number of ground truth functions). It is extremely important that only this list is outputted and no other text.
    """
    
    client = openai.OpenAI()
    response = client.chat.completions.create(
        model="o3-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that helps evaluate error detection in code."},
            {"role": "user", "content": prompt}
        ]
    )
    
    response_text = response.choices[0].message.content
    
    return response_text

def tabulate_results(df, save_file, max_workers=5):
    """
    Process rows of the dataframe in parallel using process_row
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Create futures for each row using the Buggy Functions and Minions Analysis columns
        futures = {
            executor.submit(process_row, row['Buggy Functions'], row['Model Analysis']): i 
            for i, row in df.iterrows()
        }
        
        # Process results as they complete
        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                result = future.result()
                row_idx = futures[future]
                results.append((df.iloc[row_idx]['File'], result))
            except Exception as e:
                row_idx = futures[future]
                print(f"Error processing row {row_idx}: {e}")
    
    # Save results
    results_df = pd.DataFrame(results, columns=['File', 'Evaluation'])
    results_df.to_csv(save_file, index=False)
    
    print("Completed")


In [18]:
# Read the results CSV file
df = pd.read_csv('minions_results.csv')

# Run tabulate_results on the dataframe
tabulate_results(df, 'minions_evaluations_og.csv', max_workers=8)

100%|██████████| 50/50 [01:44<00:00,  2.08s/it]

Completed





In [19]:
# Read evaluations CSV
eval_df = pd.read_csv('minions_evaluations_og.csv') # Sometimes extra whitespace in the csv causes error e.g. "[a,b,c,d,e] "

# Convert string representation of list to actual list and split into columns
eval_df[['true_positives', 'true_negatives', 'false_positives', 'false_negatives', 'total_errors']] = pd.DataFrame(
    eval_df['Evaluation'].str.strip('[]').str.split(',').tolist(), 
    index=eval_df.index
).astype(int)

# Drop original Evaluation column
eval_df = eval_df.drop('Evaluation', axis=1)

# Save updated dataframe
eval_df.to_csv(f'minions_evaluations-{MODEL_NAME}.csv', index=False)

print("Split evaluation column into separate metrics columns")
eval_df.head()


Split evaluation column into separate metrics columns


Unnamed: 0,File,true_positives,true_negatives,false_positives,false_negatives,total_errors
0,django:contrib:auth:migrations:0011_update_pro...,0,0,0,2,2
1,sympy:polys:polyoptions.py,0,0,0,10,10
2,django:forms:widgets.py,0,0,0,10,10
3,src:_pytest:python.py,1,0,0,10,11
4,src:_pytest:_io:saferepr.py,1,0,0,8,9


In [20]:
# Calculate precision, recall and F1 score
precision = eval_df['true_positives'].sum() / (eval_df['true_positives'].sum() + eval_df['false_positives'].sum())
recall = eval_df['true_positives'].sum() / (eval_df['true_positives'].sum() + eval_df['false_negatives'].sum())
f1_score = 2 * (precision * recall) / (precision + recall)
print("Minions Scores with gpt-4o (num rounds=5)")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}") 
print(f"F1 Score: {f1_score:.3f}")

Minions Scores with o3-mini (num rounds=5)
Precision: 0.449
Recall: 0.053
F1 Score: 0.095


### Run GPT model to baseline error detection

In [16]:
import csv
from tqdm import tqdm
import openai
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

def process_row(buggy_code):
    prompt_old = f"""Identify all errors in the code and provide a brief explanation of each error. The errors might be logical or there might be multiple errors in the code. Identify them all.
    Your final answer should be a list of all errors and proposed code fixes (respond with code).
    Here is the buggy code: {buggy_code}."""

    prompt = f"""Identify all functions that contain any errors in the code. The errors might be logical or there might be multiple errors within one function.
    Your final answer should start with a list of all functions containing errors. This should then be followed by proposed code fixes (respond with code) for each erroneous function.
    Here is the buggy code: {buggy_code}."""
    
    client = openai.OpenAI()
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that helps recognize errors in code."},
            {"role": "user", "content": prompt}
        ]
    )
    
    response_text = response.choices[0].message.content
    
    return response_text

def overall_results(df, save_file, max_workers=5):
    """
    The input df is a .csv that has columns File, Buggy Code, Errors, Buggy Functions
    Process rows of the dataframe in parallel using process_row
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Create futures for each row using the Modified Code and column
        futures = {
            executor.submit(process_row, row['Buggy Code']): i 
            for i, row in df.iterrows()
        }
        
        # Process results as they complete
        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                result = future.result()
                row_idx = futures[future]
                results.append((df.iloc[row_idx]['File'], df.iloc[row_idx]['Buggy Code'], df.iloc[row_idx]['Errors'], df.iloc[row_idx]['Buggy Functions'], result))
            except Exception as e:
                row_idx = futures[future]
                print(f"Error processing row {row_idx}: {e}")
    
    # Save results
    results_df = pd.DataFrame(results, columns=['File', 'Buggy Code', 'Errors', 'Buggy Functions', 'Model Analysis'])
    results_df.to_csv(save_file, index=False)
    
    print("Completed")

In [17]:
# Read the results CSV file
df = pd.read_csv('swebench_data.csv')

# Run overall_results on the dataframe
overall_results(df, save_file='baseline_results.csv', max_workers=8)

100%|██████████| 50/50 [03:22<00:00,  4.05s/it]

Completed





#### Evaluate performance

In [18]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_row(ground_truth, minions_text):
    """
    Process a single Python file by evaluating results using GPT.
    Returns a tuple of 5 integers: (true positives, true negatives, false positives, false negatives, total number of actual errors)
    """
    
    prompt = f"""Here is the ground truth list of functions containing errors within a codebase:
    {ground_truth}.

    You are also provided a list of the functions identified by the model:
    {minions_text}
    
    Please return a list of 5 integers in the format [a, b, c, d, e] corresponding to:
    (true positives, true negatives, false positives, false negatives, total number of ground truth functions). It is extremely important that only this list is outputted and no other text.
    """
    
    client = openai.OpenAI()
    response = client.chat.completions.create(
        model="o3-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that helps evaluate error detection in code."},
            {"role": "user", "content": prompt}
        ]
    )
    
    response_text = response.choices[0].message.content
    
    return response_text

def tabulate_results(df, save_file, max_workers=5):
    """
    Process rows of the dataframe in parallel using process_row
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Create futures for each row using the Buggy Functions and Minions Analysis columns
        futures = {
            executor.submit(process_row, row['Buggy Functions'], row['Model Analysis']): i 
            for i, row in df.iterrows()
        }
        
        # Process results as they complete
        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                result = future.result()
                row_idx = futures[future]
                results.append((df.iloc[row_idx]['File'], result))
            except Exception as e:
                row_idx = futures[future]
                print(f"Error processing row {row_idx}: {e}")
    
    # Save results
    results_df = pd.DataFrame(results, columns=['File', 'Evaluation'])
    results_df.to_csv(save_file, index=False)
    
    print("Completed")


In [19]:
# Read the results CSV file
df = pd.read_csv('baseline_results.csv')

# Run tabulate_results on the dataframe
tabulate_results(df, 'baseline_evaluations_og.csv', max_workers=8)

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [01:37<00:00,  1.95s/it]

Completed





In [20]:
# Read evaluations CSV
eval_df = pd.read_csv('baseline_evaluations_og.csv') # Sometimes extra whitespace in the csv causes error e.g. "[a,b,c,d,e] "

# Convert string representation of list to actual list and split into columns
eval_df[['true_positives', 'true_negatives', 'false_positives', 'false_negatives', 'total_errors']] = pd.DataFrame(
    eval_df['Evaluation'].str.strip('[]').str.split(',').tolist(), 
    index=eval_df.index
).astype(int)

# Drop original Evaluation column
eval_df = eval_df.drop('Evaluation', axis=1)

# Save updated dataframe
eval_df.to_csv(f'baseline_evaluations-{MODEL_NAME}.csv', index=False)

print("Split evaluation column into separate metrics columns")
eval_df.head()


Split evaluation column into separate metrics columns


Unnamed: 0,File,true_positives,true_negatives,false_positives,false_negatives,total_errors
0,src:_pytest:_io:saferepr.py,2,0,0,7,9
1,django:contrib:auth:migrations:0011_update_pro...,2,0,0,0,2
2,django:db:models:query_utils.py,3,0,0,7,10
3,django:db:migrations:autodetector.py,1,0,0,9,10
4,astropy:modeling:convolution.py,2,0,0,3,5


In [21]:
# Calculate precision, recall and F1 score
precision = eval_df['true_positives'].sum() / (eval_df['true_positives'].sum() + eval_df['false_positives'].sum())
recall = eval_df['true_positives'].sum() / (eval_df['true_positives'].sum() + eval_df['false_negatives'].sum())
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}") 
print(f"F1 Score: {f1_score:.3f}")

Precision: 0.771
Recall: 0.321
F1 Score: 0.453
